Skip to content

Commit 07ef9ea

Browse files
committed
Merge branch 'master' into quantize
2 parents 67bef17 + af1127d commit 07ef9ea

320 files changed

Lines changed: 24608 additions & 18082 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/labeler.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,18 @@ android:
7373
- changed-files:
7474
- any-glob-to-any-file:
7575
- examples/llama.android/**
76+
server/webui:
77+
- changed-files:
78+
- any-glob-to-any-file:
79+
- tools/server/webui/**
80+
- tools/server/public/**
7681
server:
7782
- changed-files:
7883
- any-glob-to-any-file:
7984
- tools/server/**
85+
86+
87+
8088
ggml:
8189
- changed-files:
8290
- any-glob-to-any-file:

.github/workflows/release.yml

Lines changed: 27 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -36,55 +36,26 @@ env:
3636
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
3737

3838
jobs:
39-
macOS-arm64:
40-
runs-on: macos-14
41-
42-
steps:
43-
- name: Clone
44-
id: checkout
45-
uses: actions/checkout@v6
46-
with:
47-
fetch-depth: 0
48-
49-
- name: ccache
50-
uses: ggml-org/ccache-action@v1.2.21
51-
with:
52-
key: macOS-latest-arm64
53-
evict-old-files: 1d
54-
55-
- name: Build
56-
id: cmake_build
57-
run: |
58-
sysctl -a
59-
cmake -B build \
60-
-DCMAKE_INSTALL_RPATH='@loader_path' \
61-
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
62-
-DLLAMA_FATAL_WARNINGS=ON \
63-
-DLLAMA_BUILD_BORINGSSL=ON \
64-
-DGGML_METAL_USE_BF16=ON \
65-
-DGGML_METAL_EMBED_LIBRARY=ON \
66-
-DGGML_RPC=ON \
67-
${{ env.CMAKE_ARGS }}
68-
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
69-
70-
- name: Determine tag name
71-
id: tag
72-
uses: ./.github/actions/get-tag-name
73-
74-
- name: Pack artifacts
75-
id: pack_artifacts
76-
run: |
77-
cp LICENSE ./build/bin/
78-
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
79-
80-
- name: Upload artifacts
81-
uses: actions/upload-artifact@v6
82-
with:
83-
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
84-
name: llama-bin-macos-arm64.tar.gz
39+
macOS-cpu:
40+
strategy:
41+
matrix:
42+
include:
43+
- build: 'arm64'
44+
arch: 'arm64'
45+
os: macos-14
46+
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
47+
- build: 'arm64-kleidiai'
48+
arch: 'arm64'
49+
os: macos-14
50+
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
51+
- build: 'x64'
52+
arch: 'x64'
53+
os: macos-15-intel
54+
# Metal is disabled on x64 due to intermittent failures with GitHub runners not having a GPU:
55+
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
56+
defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
8557

86-
macOS-x64:
87-
runs-on: macos-15-intel
58+
runs-on: ${{ matrix.os }}
8859

8960
steps:
9061
- name: Clone
@@ -96,23 +67,20 @@ jobs:
9667
- name: ccache
9768
uses: ggml-org/ccache-action@v1.2.21
9869
with:
99-
key: macOS-latest-x64
70+
key: macOS-latest-${{ matrix.arch }}
10071
evict-old-files: 1d
10172

10273
- name: Build
10374
id: cmake_build
10475
run: |
10576
sysctl -a
106-
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
107-
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
10877
cmake -B build \
78+
${{ matrix.defines }} \
10979
-DCMAKE_INSTALL_RPATH='@loader_path' \
11080
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
11181
-DLLAMA_FATAL_WARNINGS=ON \
11282
-DLLAMA_BUILD_BORINGSSL=ON \
113-
-DGGML_METAL=OFF \
114-
-DGGML_RPC=ON \
115-
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
83+
${{ env.CMAKE_ARGS }}
11684
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
11785
11886
- name: Determine tag name
@@ -123,13 +91,13 @@ jobs:
12391
id: pack_artifacts
12492
run: |
12593
cp LICENSE ./build/bin/
126-
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
94+
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
12795
12896
- name: Upload artifacts
12997
uses: actions/upload-artifact@v6
13098
with:
131-
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
132-
name: llama-bin-macos-x64.tar.gz
99+
path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
100+
name: llama-bin-macos-${{ matrix.build }}.tar.gz
133101

134102
ubuntu-cpu:
135103
strategy:
@@ -1003,8 +971,7 @@ jobs:
1003971
- ubuntu-cpu
1004972
- ubuntu-vulkan
1005973
- ubuntu-24-openvino
1006-
- macOS-arm64
1007-
- macOS-x64
974+
- macOS-cpu
1008975
- ios-xcode-build
1009976
- openEuler-cann
1010977

@@ -1079,6 +1046,7 @@ jobs:
10791046
10801047
**macOS/iOS:**
10811048
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
1049+
- [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
10821050
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
10831051
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
10841052

cmake/arm64-linux-clang.cmake

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
set( CMAKE_SYSTEM_NAME Linux )
2+
set( CMAKE_SYSTEM_PROCESSOR arm64 )
3+
4+
set( target aarch64-linux-gnu )
5+
6+
set( CMAKE_C_COMPILER clang )
7+
set( CMAKE_CXX_COMPILER clang++ )
8+
9+
set( CMAKE_C_COMPILER_TARGET ${target} )
10+
set( CMAKE_CXX_COMPILER_TARGET ${target} )
11+
12+
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
13+
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
14+
15+
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
16+
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
17+

common/arg.cpp

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -291,14 +291,16 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
291291
hf_tag = "default";
292292
}
293293

294-
const bool offline = params.offline;
295294
std::string model_endpoint = get_model_endpoint();
296295
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
297296

298297
// prepare local path for caching
299298
auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
300299
auto preset_path = fs_get_cache_file(preset_fname);
301-
const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
300+
common_download_opts opts;
301+
opts.bearer_token = params.hf_token;
302+
opts.offline = params.offline;
303+
const int status = common_download_file_single(preset_url, preset_path, opts);
302304
const bool has_preset = status >= 200 && status < 400;
303305

304306
// remote preset is optional, so we don't error out if not found
@@ -341,10 +343,10 @@ static handle_model_result common_params_handle_model(struct common_params_model
341343
model.hf_file = model.path;
342344
model.path = "";
343345
}
344-
common_download_model_opts opts;
345-
opts.download_mmproj = true;
346+
common_download_opts opts;
347+
opts.bearer_token = bearer_token;
346348
opts.offline = offline;
347-
auto download_result = common_download_model(model, bearer_token, opts);
349+
auto download_result = common_download_model(model, opts, true);
348350

349351
if (download_result.model_path.empty()) {
350352
LOG_ERR("error: failed to download model from Hugging Face\n");
@@ -365,9 +367,10 @@ static handle_model_result common_params_handle_model(struct common_params_model
365367
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
366368
}
367369

368-
common_download_model_opts opts;
370+
common_download_opts opts;
371+
opts.bearer_token = bearer_token;
369372
opts.offline = offline;
370-
auto download_result = common_download_model(model, bearer_token, opts);
373+
auto download_result = common_download_model(model, opts);
371374
if (download_result.model_path.empty()) {
372375
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
373376
exit(1);
@@ -2348,19 +2351,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
23482351
}
23492352
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
23502353
add_opt(common_arg(
2351-
{"-sm", "--split-mode"}, "{none,layer,row}",
2354+
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
23522355
"how to split the model across multiple GPUs, one of:\n"
23532356
"- none: use one GPU only\n"
2354-
"- layer (default): split layers and KV across GPUs\n"
2355-
"- row: split rows across GPUs",
2357+
"- layer (default): split layers and KV across GPUs (pipelined)\n"
2358+
"- row: split weight across GPUs by rows (parallelized)\n"
2359+
"- tensor: split weights and KV across GPUs (parallelized, EXPERIMENTAL)",
23562360
[](common_params & params, const std::string & value) {
2357-
std::string arg_next = value;
2358-
if (arg_next == "none") {
2361+
if (value == "none") {
23592362
params.split_mode = LLAMA_SPLIT_MODE_NONE;
2360-
} else if (arg_next == "layer") {
2363+
} else if (value == "layer") {
23612364
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
2362-
} else if (arg_next == "row") {
2365+
} else if (value == "row") {
23632366
params.split_mode = LLAMA_SPLIT_MODE_ROW;
2367+
} else if (value == "tensor") {
2368+
params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
23642369
} else {
23652370
throw std::invalid_argument("invalid value");
23662371
}

common/chat-auto-parser-generator.cpp

Lines changed: 18 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
6969
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
7070
builder.resolve_refs(schema);
7171
});
72+
if (has_response_format) {
73+
auto schema = inputs.json_schema;
74+
builder.resolve_refs(schema);
75+
}
7276
parser.build_grammar(builder, data.grammar_lazy);
7377
});
7478

@@ -332,58 +336,36 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
332336
const auto & inputs = ctx.inputs;
333337
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
334338

339+
auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
340+
335341
common_peg_parser tool_choice = p.choice();
336342

337343
foreach_function(inputs.tools, [&](const json & tool) {
338344
const auto & func = tool.at("function");
339345
std::string name = func.at("name");
340-
const auto & params = func.contains("parameters") ? func.at("parameters") : json::object();
346+
auto params = func.contains("parameters") ? func.at("parameters") : json::object();
341347
const auto & properties = params.contains("properties") ? params.at("properties") : json::object();
348+
342349
std::set<std::string> required;
350+
if (params.contains("required")) {
351+
params.at("required").get_to(required);
352+
}
353+
354+
auto schema_info = common_schema_info();
355+
schema_info.resolve_refs(params);
343356

344357
// Build parser for each argument, separating required and optional
345358
std::vector<common_peg_parser> required_parsers;
346359
std::vector<common_peg_parser> optional_parsers;
347360
for (const auto & [param_name, param_schema] : properties.items()) {
348-
bool is_required = required.find(param_name) != required.end();
349-
std::string type = "object";
350-
if (param_schema.contains("type")) {
351-
const auto & type_obj = param_schema.at("type");
352-
if (type_obj.is_string()) {
353-
type_obj.get_to(type);
354-
} else if (type_obj.is_array()) {
355-
// Handle nullable types like ["string", "null"]
356-
for (const auto & t : type_obj) {
357-
if (t.is_string() && t.get<std::string>() != "null") {
358-
type = t.get<std::string>();
359-
break;
360-
}
361-
}
362-
} else if (type_obj.is_object()) {
363-
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
364-
type_obj.at("type").get_to(type);
365-
}
366-
}
367-
}
368-
// Infer string type from enum values when type is unspecified
369-
if (type == "object" && param_schema.contains("enum")) {
370-
const auto & enum_vals = param_schema.at("enum");
371-
if (enum_vals.is_array()) {
372-
for (const auto & v : enum_vals) {
373-
if (v.is_string()) {
374-
type = "string";
375-
break;
376-
}
377-
}
378-
}
379-
}
361+
bool is_required = required.find(param_name) != required.end();
380362

381363
auto arg =
382364
p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
383365
arguments.name_suffix) +
384366
arguments.value_prefix +
385-
(type == "string" ?
386-
p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
367+
(schema_info.resolves_to_string(param_schema) ?
368+
p.tool_arg_string_value(p.schema(until_suffix,
387369
"tool-" + name + "-arg-" + param_name + "-schema",
388370
param_schema, true)) :
389371
p.tool_arg_json_value(p.schema(
@@ -414,7 +396,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
414396
for (const auto & opt : optional_parsers) {
415397
any_opt |= opt;
416398
}
417-
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
399+
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, -1);
418400
}
419401

420402
if (!arguments.start.empty()) {

0 commit comments

Comments
 (0)