EAddario
diff --git a/‎.github/labeler.yml‎
Lines changed: 8 additions & 0 deletions b/‎.github/labeler.yml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎.github/workflows/release.yml‎
Lines changed: 27 additions & 59 deletions b/‎.github/workflows/release.yml‎
Lines changed: 27 additions & 59 deletions
diff --git a/‎cmake/arm64-linux-clang.cmake‎
Lines changed: 17 additions & 0 deletions b/‎cmake/arm64-linux-clang.cmake‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎common/arg.cpp‎
Lines changed: 19 additions & 14 deletions b/‎common/arg.cpp‎
Lines changed: 19 additions & 14 deletions
diff --git a/‎common/chat-auto-parser-generator.cpp‎
Lines changed: 18 additions & 36 deletions b/‎common/chat-auto-parser-generator.cpp‎
Lines changed: 18 additions & 36 deletions
@@ -73,10 +73,18 @@ android:
     - changed-files:
         - any-glob-to-any-file:
             - examples/llama.android/**
+server/webui:
+    - changed-files:
+        - any-glob-to-any-file:
+            - tools/server/webui/**
+            - tools/server/public/**
 server:
     - changed-files:
         - any-glob-to-any-file:
             - tools/server/**
+
+
+
 ggml:
     - changed-files:
         - any-glob-to-any-file:
 
@@ -36,55 +36,26 @@ env:
   CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
 
 jobs:
-  macOS-arm64:
-    runs-on: macos-14
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-arm64
-          evict-old-files: 1d
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build \
-            -DCMAKE_INSTALL_RPATH='@loader_path' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
-          name: llama-bin-macos-arm64.tar.gz
+  macOS-cpu:
+    strategy:
+      matrix:
+        include:
+          - build: 'arm64'
+            arch: 'arm64'
+            os: macos-14
+            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
+          - build: 'arm64-kleidiai'
+            arch: 'arm64'
+            os: macos-14
+            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
+          - build: 'x64'
+            arch: 'x64'
+            os: macos-15-intel
+            # Metal is disabled on x64 due to intermittent failures with GitHub runners not having a GPU:
+            # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+            defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
 
-  macOS-x64:
-    runs-on: macos-15-intel
+    runs-on: ${{ matrix.os }}
 
     steps:
       - name: Clone
@@ -96,23 +67,20 @@ jobs:
       - name: ccache
         uses: ggml-org/ccache-action@v1.2.21
         with:
-          key: macOS-latest-x64
+          key: macOS-latest-${{ matrix.arch }}
           evict-old-files: 1d
 
       - name: Build
         id: cmake_build
         run: |
           sysctl -a
-          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
           cmake -B build \
+            ${{ matrix.defines }} \
             -DCMAKE_INSTALL_RPATH='@loader_path' \
             -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
             -DLLAMA_FATAL_WARNINGS=ON \
             -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+            ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Determine tag name
@@ -123,13 +91,13 @@ jobs:
         id: pack_artifacts
         run: |
           cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
 
       - name: Upload artifacts
         uses: actions/upload-artifact@v6
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
-          name: llama-bin-macos-x64.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
+          name: llama-bin-macos-${{ matrix.build }}.tar.gz
 
   ubuntu-cpu:
     strategy:
@@ -1003,8 +971,7 @@ jobs:
       - ubuntu-cpu
       - ubuntu-vulkan
       - ubuntu-24-openvino
-      - macOS-arm64
-      - macOS-x64
+      - macOS-cpu
       - ios-xcode-build
       - openEuler-cann
 
@@ -1079,6 +1046,7 @@ jobs:
 
             **macOS/iOS:**
             - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
+            - [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
             - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
             - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
 
 
@@ -0,0 +1,17 @@
+set( CMAKE_SYSTEM_NAME Linux )  # toolchain file: the target OS is Linux
+set( CMAKE_SYSTEM_PROCESSOR arm64 )  # target CPU architecture
+
+set( target aarch64-linux-gnu )  # target triple shared by the C and C++ compilers below
+
+set( CMAKE_C_COMPILER    clang )
+set( CMAKE_CXX_COMPILER  clang++ )
+
+set( CMAKE_C_COMPILER_TARGET   ${target} )  # hand the triple to the C compiler
+set( CMAKE_CXX_COMPILER_TARGET ${target} )  # hand the triple to the C++ compiler
+
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )  # architecture and floating-point flags
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )  # warnings switched off for this toolchain
+
+set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )  # initial C flags: both groups above
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )  # initial C++ flags: identical to the C flags
+
@@ -291,14 +291,16 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
         hf_tag = "default";
     }
 
-    const bool offline = params.offline;
     std::string model_endpoint = get_model_endpoint();
     auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
 
     // prepare local path for caching
     auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
     auto preset_path = fs_get_cache_file(preset_fname);
-    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    common_download_opts opts;
+    opts.bearer_token = params.hf_token;
+    opts.offline = params.offline;
+    const int status = common_download_file_single(preset_url, preset_path, opts);
     const bool has_preset = status >= 200 && status < 400;
 
     // remote preset is optional, so we don't error out if not found
@@ -341,10 +343,10 @@ static handle_model_result common_params_handle_model(struct common_params_model
             model.hf_file = model.path;
             model.path = "";
         }
-        common_download_model_opts opts;
-        opts.download_mmproj = true;
+        common_download_opts opts;
+        opts.bearer_token = bearer_token;
         opts.offline = offline;
-        auto download_result = common_download_model(model, bearer_token, opts);
+        auto download_result = common_download_model(model, opts, true);
 
         if (download_result.model_path.empty()) {
             LOG_ERR("error: failed to download model from Hugging Face\n");
@@ -365,9 +367,10 @@ static handle_model_result common_params_handle_model(struct common_params_model
             model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
 
-        common_download_model_opts opts;
+        common_download_opts opts;
+        opts.bearer_token = bearer_token;
         opts.offline = offline;
-        auto download_result = common_download_model(model, bearer_token, opts);
+        auto download_result = common_download_model(model, opts);
         if (download_result.model_path.empty()) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -2348,19 +2351,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
     add_opt(common_arg(
-        {"-sm", "--split-mode"}, "{none,layer,row}",
+        {"-sm", "--split-mode"}, "{none,layer,row,tensor}",
         "how to split the model across multiple GPUs, one of:\n"
         "- none: use one GPU only\n"
-        "- layer (default): split layers and KV across GPUs\n"
-        "- row: split rows across GPUs",
+        "- layer (default): split layers and KV across GPUs (pipelined)\n"
+        "- row: split weight across GPUs by rows (parallelized)\n"
+        "- tensor: split weights and KV across GPUs (parallelized, EXPERIMENTAL)",
         [](common_params & params, const std::string & value) {
-            std::string arg_next = value;
-            if (arg_next == "none") {
+            if (value == "none") {
                 params.split_mode = LLAMA_SPLIT_MODE_NONE;
-            } else if (arg_next == "layer") {
+            } else if (value == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
-            } else if (arg_next == "row") {
+            } else if (value == "row") {
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
+            } else if (value == "tensor") {
+                params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
             } else {
                 throw std::invalid_argument("invalid value");
             }
 
@@ -69,6 +69,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
                 auto         schema   = function.contains("parameters") ? function.at("parameters") : json::object();
                 builder.resolve_refs(schema);
             });
+            if (has_response_format) {
+                auto schema = inputs.json_schema;
+                builder.resolve_refs(schema);
+            }
             parser.build_grammar(builder, data.grammar_lazy);
         });
 
@@ -332,58 +336,36 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
     const auto & inputs      = ctx.inputs;
     bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
 
+    auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
+
     common_peg_parser tool_choice = p.choice();
 
     foreach_function(inputs.tools, [&](const json & tool) {
         const auto &          func       = tool.at("function");
         std::string           name       = func.at("name");
-        const auto &          params     = func.contains("parameters") ? func.at("parameters") : json::object();
+        auto                  params     = func.contains("parameters") ? func.at("parameters") : json::object();
         const auto &          properties = params.contains("properties") ? params.at("properties") : json::object();
+
         std::set<std::string> required;
+        if (params.contains("required")) {
+            params.at("required").get_to(required);
+        }
+
+        auto schema_info = common_schema_info();
+        schema_info.resolve_refs(params);
 
         // Build parser for each argument, separating required and optional
         std::vector<common_peg_parser> required_parsers;
         std::vector<common_peg_parser> optional_parsers;
         for (const auto & [param_name, param_schema] : properties.items()) {
-            bool        is_required = required.find(param_name) != required.end();
-            std::string type        = "object";
-            if (param_schema.contains("type")) {
-                const auto & type_obj = param_schema.at("type");
-                if (type_obj.is_string()) {
-                    type_obj.get_to(type);
-                } else if (type_obj.is_array()) {
-                    // Handle nullable types like ["string", "null"]
-                    for (const auto & t : type_obj) {
-                        if (t.is_string() && t.get<std::string>() != "null") {
-                            type = t.get<std::string>();
-                            break;
-                        }
-                    }
-                } else if (type_obj.is_object()) {
-                    if (type_obj.contains("type") && type_obj.at("type").is_string()) {
-                        type_obj.at("type").get_to(type);
-                    }
-                }
-            }
-            // Infer string type from enum values when type is unspecified
-            if (type == "object" && param_schema.contains("enum")) {
-                const auto & enum_vals = param_schema.at("enum");
-                if (enum_vals.is_array()) {
-                    for (const auto & v : enum_vals) {
-                        if (v.is_string()) {
-                            type = "string";
-                            break;
-                        }
-                    }
-                }
-            }
+            bool is_required = required.find(param_name) != required.end();
 
             auto arg =
                 p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
                                            arguments.name_suffix) +
                            arguments.value_prefix +
-                           (type == "string" ?
-                                p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
+                           (schema_info.resolves_to_string(param_schema) ?
+                                p.tool_arg_string_value(p.schema(until_suffix,
                                                                  "tool-" + name + "-arg-" + param_name + "-schema",
                                                                  param_schema, true)) :
                                 p.tool_arg_json_value(p.schema(
@@ -414,7 +396,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
             for (const auto & opt : optional_parsers) {
                 any_opt |= opt;
             }
-            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
+            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, -1);
         }
 
         if (!arguments.start.empty()) {