pytorch
diff --git a/backends/vulkan/test/op_tests/dequantize_test.cpp b/backends/vulkan/test/op_tests/dequantize_test.cpp
Lines changed: 24 additions & 14 deletions
diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS
Lines changed: 1 addition & 2 deletions
diff --git a/examples/models/llama/config/llm_config.py b/examples/models/llama/config/llm_config.py
Lines changed: 2 additions & 2 deletions
diff --git a/examples/models/llama/export_llama.py b/examples/models/llama/export_llama.py
Lines changed: 9 additions & 7 deletions
diff --git a/examples/models/llama/export_llama_args.py b/examples/models/llama/export_llama_args.py
Lines changed: 0 additions & 21 deletions
diff --git a/examples/models/llama/export_llama_hydra.py b/examples/models/llama/export_llama_hydra.py
Lines changed: 0 additions & 28 deletions
diff --git a/extension/llm/export/TARGETS b/extension/llm/export/TARGETS
Lines changed: 35 additions & 0 deletions
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
Lines changed: 21 additions & 3 deletions
diff --git a/extension/llm/export/export_llm.py b/extension/llm/export/export_llm.py
Lines changed: 45 additions & 0 deletions
diff --git a/extension/llm/export/test/test_builder.py b/extension/llm/export/test/test_builder.py
Lines changed: 1 addition & 1 deletion
@@ -807,14 +807,19 @@ TEST(
 
 TEST(
     VulkanDequantizePerTensorTest,
-    test_vulkan_dequantize_per_tensor_int32_to_double) {
+    test_vulkan_dequantize_per_tensor_int8_to_double) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
   test_vulkan_dequantize_per_tensor(
-      {2, 4, 3}, // input sizes
-      0.0001, // scale
-      100, // zero_point
-      -2147483648, // quant_min
-      2147483647, // quant_max
-      at::kInt, // input dtype
+      {2, 3}, // input sizes
+      0.05, // scale
+      10, // zero_point
+      -128, // quant_min
+      127, // quant_max
+      at::kChar, // input dtype
       at::kDouble); // output dtype
 }
 
@@ -1316,16 +1321,21 @@ TEST(
 
 TEST(
     VulkanDequantizePerTokenTest,
-    test_vulkan_dequantize_per_token_int32_to_double) {
-  std::vector<float> scales = {0.0001, 0.0002, 0.0003, 0.0};
-  std::vector<int> zero_points = {100, -100, 50, -50};
+    test_vulkan_dequantize_per_token_int8_to_double) {
+  if (!vkcompute::api::context()
+           ->adapter_ptr()
+           ->has_full_int8_buffers_support()) {
+    GTEST_SKIP();
+  }
+  std::vector<float> scales = {0.05, 0.001};
+  std::vector<int> zero_points = {10, -5};
 
   test_vulkan_dequantize_per_token(
-      {2, 2, 8}, // input sizes (2*2=4 tokens)
+      {2, 2}, // input sizes (2 tokens)
       scales,
       zero_points,
-      -2147483648, // quant_min
-      2147483647, // quant_max
-      at::kInt, // input dtype
+      -128, // quant_min
+      127, // quant_max
+      at::kChar, // input dtype
       at::kDouble); // output dtype
 }
@@ -85,6 +85,7 @@ runtime.python_binary(
         ":export_library",
         "//caffe2:torch",
         "//executorch/extension/pybindings:aten_lib",
+        "//executorch/extension/llm/export:export_llm_lib",
     ],
 )
 
@@ -133,8 +134,6 @@ runtime.python_library(
     name = "export_library",
     srcs = [
         "export_llama.py",
-        "export_llama_args.py",
-        "export_llama_hydra.py",
         "export_llama_lib.py",
         "model.py",
     ],
 
@@ -86,7 +86,7 @@ class BaseConfig:
     checkpoint_dir: Optional[str] = None
     tokenizer_path: Optional[str] = None
     metadata: Optional[str] = None
-    use_lora: int = int
+    use_lora: int = 0
     fairseq2: bool = False
     preq_mode: Optional[PreqMode] = None
     preq_group_size: int = 32
@@ -214,7 +214,7 @@ class ExportConfig:
 
     max_seq_length: int = 128
     max_context_length: int = 128
-    output_dir: Optional[str] = None
+    output_dir: str = "."
     output_name: Optional[str] = None
     so_library: Optional[str] = None
     export_only: bool = False
 
@@ -17,6 +17,11 @@
 
 import torch
 
+from executorch.examples.models.llama.export_llama_lib import (
+    build_args_parser,
+    export_llama,
+)
+
 sys.setrecursionlimit(4096)
 
 
@@ -39,15 +44,12 @@ def main() -> None:
         sys.argv = [arg for arg in sys.argv if arg != "--hydra"]
         print(f"running with {sys.argv}")
         runpy.run_module(
-            "executorch.examples.models.llama.export_llama_hydra", run_name="__main__"
+            "executorch.extension.llm.export.export_llm", run_name="__main__"
         )
     else:
-        # Use the legacy version of the export_llama script which uses argsparse.
-        from executorch.examples.models.llama.export_llama_args import (
-            main as export_llama_args_main,
-        )
-
-        export_llama_args_main(remaining_args)
+        parser = build_args_parser()
+        remaining_args = parser.parse_args(remaining_args)
+        export_llama(remaining_args)
 
 
 if __name__ == "__main__":
 
@@ -47,6 +47,41 @@ runtime.python_library(
     ],
 )
 
+runtime.python_binary(
+    name = "export_llm",
+    srcs = [
+        "export_llm.py",
+    ],
+    main_function = "executorch.extension.llm.export.export_llm.main",
+    preload_deps = [
+        "//executorch/extension/llm/custom_ops:model_sharding_py",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+        "//executorch/kernels/quantized:aot_lib",
+    ],
+    deps = [
+        "fbsource//third-party/pypi/hydra-core:hydra-core",
+        "fbsource//third-party/pypi/omegaconf:omegaconf",
+        "//executorch/examples/models/llama:export_library",
+        "//executorch/extension/pybindings:aten_lib",
+    ],
+)
+
+runtime.python_library(
+    name = "export_llm_lib",
+    srcs = [
+        "export_llm.py",
+    ],
+    deps = [
+        "fbsource//third-party/pypi/hydra-core:hydra-core",
+        "fbsource//third-party/pypi/omegaconf:omegaconf",
+        "//executorch/examples/models/llama:export_library",
+    ],
+    visibility = [
+        "//executorch/examples/...",
+        "//executorch/extension/llm/...",
+    ],
+)
+
 runtime.python_test(
     name = "export_passes_test",
     srcs = [
 
@@ -133,6 +133,19 @@ def __init__(
         self.output_dir = "."
         self._saved_pte_filename = None
 
+    def __post_init__(self):
+        """
+        Set metadata["get_max_seq_len"] to the dynamic token dim's max, if any.
+        NOTE(review): only dataclasses auto-call __post_init__; confirm this runs."""
+        dynamic_shape = self._get_dynamic_shape()
+        if dynamic_shape is not None:
+            token_dim = dynamic_shape[0][1]
+            if self.verbose:
+                logging.info(
+                    f"Metadata 'get_max_seq_len' is being updated to match torch.export's dynamic shape max: {token_dim.max}"
+                )
+            self.metadata["get_max_seq_len"] = token_dim.max
+
     def set_output_dir(self, output_dir: str) -> "LLMEdgeManager":
         """
         Set the directory where the .pte file will be saved.
@@ -180,14 +193,19 @@ def _get_dynamic_shape(self) -> Any:
         if self.dynamic_shapes:
             return self.dynamic_shapes
 
-        dim = torch.export.Dim("token_dim", max=self.max_seq_len - 1)
         if self.enable_dynamic_shape:
             if not self.use_kv_cache:
                 # Only one input argument: tokens
-                self.dynamic_shapes = ({1: dim},)
+                # Subtract 1 from max_seq_len to work around a torch.export limitation: https://gist.github.com/larryliu0820/419022a57e24d5e64150e325a685eaad
+                self.dynamic_shapes = (
+                    {1: torch.export.Dim("token_dim", max=self.max_seq_len - 1)},
+                )
             else:
                 # Two input arguments: tokens and input_pos but input_pos is static shape
-                self.dynamic_shapes = ({1: dim}, {"input_pos": {0: 1}})
+                self.dynamic_shapes = (
+                    {1: torch.export.Dim("token_dim", max=self.max_seq_len)},
+                    {"input_pos": {0: 1}},
+                )
         else:
             # Two input arguments: tokens and input_pos but both are of static shape
             self.dynamic_shapes = None
 
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Export an LLM with ExecuTorch. Currently follows the following steps:
+1. Instantiate our custom PyTorch transformer definition from examples/models/llama/llama_transformer.py.
+2. Load weights into the model.
+3. Apply source transformations/TorchAO quantization.
+4. Export model to intermediate IRs.
+5. Graph transformations/PT2E quantization.
+6. Partition graph and delegate to backend(s).
+7. Export to final ExecuTorch .pte format.
+
+Example usage using full CLI arguments:
+python -m extension.llm.export.export_llm \
+    base.model_class="llama3" \
+    model.use_sdpa_with_kv_cache=True \
+    model.use_kv_cache=True \
+    debug.verbose=True \
+    backend.xnnpack.enabled=True \
+    backend.xnnpack.extended_ops=True \
+    quantization.qmode="8da4w"
+"""
+
+import hydra
+
+from executorch.examples.models.llama.config.llm_config import LlmConfig
+from executorch.examples.models.llama.export_llama_lib import export_llama
+from hydra.core.config_store import ConfigStore
+from omegaconf import OmegaConf
+
+cs = ConfigStore.instance()
+cs.store(name="llm_config", node=LlmConfig)
+
+
+@hydra.main(version_base=None, config_path=None, config_name="llm_config")
+def main(llm_config: LlmConfig) -> None:
+    export_llama(OmegaConf.to_object(llm_config))
+
+
+if __name__ == "__main__":
+    main()
@@ -88,7 +88,7 @@ def test_get_dynamic_shape_with_dynamic_shape_enabled_with_kv_cache(self) -> Non
         # Check first element (tokens dimension)
         self.assertIsInstance(result[0], dict)
         self.assertIn(1, result[0])
-        self.assertEqual(result[0][1].max, self.max_seq_len - 1)
+        self.assertEqual(result[0][1].max, self.max_seq_len)
 
         # Check second element (input_pos dimension)
         self.assertIsInstance(result[1], dict)