Skip to content

Commit 44a6721

Browse files
committed
Merge branch 'master' into imatrix
2 parents fcba499 + beb1f0c commit 44a6721

160 files changed

Lines changed: 18106 additions & 9076 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.devops/vulkan.Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ WORKDIR /app
5050

5151
RUN apt-get update \
5252
&& apt-get install -y \
53+
build-essential \
5354
git \
5455
python3 \
5556
python3-pip \

CODEOWNERS

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22
# multiple collaborators per item can be specified
33

44
/.devops/*.Dockerfile @ngxson
5-
/.github/actions/ @slaren @CISC
5+
/.github/actions/ @CISC
66
/.github/workflows/ @CISC
7-
/.github/workflows/release.yml @slaren
8-
/.github/workflows/winget.yml @slaren
97
/ci/ @ggerganov
108
/cmake/ @ggerganov
119
/common/CMakeLists.txt @ggerganov
@@ -40,41 +38,34 @@
4038
/examples/passkey/ @ggerganov
4139
/examples/retrieval/ @ggerganov
4240
/examples/save-load-state/ @ggerganov
43-
/examples/simple-chat/ @slaren
44-
/examples/simple/ @slaren
4541
/examples/speculative-simple/ @ggerganov
4642
/examples/speculative/ @ggerganov
4743
/ggml/cmake/ @ggerganov
48-
/ggml/include/ @ggerganov @slaren
49-
/ggml/src/ggml-alloc.c @slaren
50-
/ggml/src/ggml-backend* @slaren
51-
/ggml/src/ggml-blas/ @slaren
52-
/ggml/src/ggml-common.h @ggerganov @slaren
53-
/ggml/src/ggml-cpu/ @ggerganov @slaren
44+
/ggml/include/ @ggerganov
45+
/ggml/src/ggml-common.h @ggerganov
46+
/ggml/src/ggml-cpu/ @ggerganov
5447
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
55-
/ggml/src/ggml-cuda/common.cuh @slaren
5648
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
57-
/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
5849
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
5950
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
6051
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
6152
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
6253
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
6354
/ggml/src/ggml-hip/ @IMbackK
6455
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
65-
/ggml/src/ggml-impl.h @ggerganov @slaren
56+
/ggml/src/ggml-impl.h @ggerganov
6657
/ggml/src/ggml-metal/ @ggerganov
6758
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
6859
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
6960
/ggml/src/ggml-opt.cpp @JohannesGaessler
7061
/ggml/src/ggml-quants.* @ggerganov
7162
/ggml/src/ggml-rpc/ @rgerganov
72-
/ggml/src/ggml-threading.* @ggerganov @slaren
63+
/ggml/src/ggml-threading.* @ggerganov
7364
/ggml/src/ggml-vulkan/ @0cc4m
7465
/ggml/src/ggml-webgpu/ @reeselevine
7566
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
76-
/ggml/src/ggml.c @ggerganov @slaren
77-
/ggml/src/ggml.cpp @ggerganov @slaren
67+
/ggml/src/ggml.c @ggerganov
68+
/ggml/src/ggml.cpp @ggerganov
7869
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
7970
/gguf-py/ @CISC
8071
/media/ @ggerganov
@@ -86,15 +77,11 @@
8677
/src/llama-arch.* @CISC
8778
/src/llama-chat.* @ngxson
8879
/src/llama-graph.* @CISC
89-
/src/llama-model-loader.* @slaren
9080
/src/llama-model.* @CISC
9181
/src/llama-vocab.* @CISC
9282
/src/models/ @CISC
9383
/tests/ @ggerganov
94-
/tests/test-backend-ops.cpp @slaren
95-
/tests/test-thread-safety.cpp @slaren
9684
/tools/batched-bench/ @ggerganov
97-
/tools/llama-bench/ @slaren
9885
/tools/main/ @ggerganov
9986
/tools/mtmd/ @ngxson
10087
/tools/perplexity/ @ggerganov
@@ -106,8 +93,6 @@
10693
/tools/tokenize/ @ggerganov
10794
/tools/tts/ @ggerganov
10895
/vendor/ @ggerganov
109-
/.clang-format @slaren
110-
/.clang-tidy @slaren
11196
/AUTHORS @ggerganov
11297
/CMakeLists.txt @ggerganov
11398
/CONTRIBUTING.md @ggerganov

ci/run.sh

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ sd=`dirname $0`
4545
cd $sd/../
4646
SRC=`pwd`
4747

48-
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
48+
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
4949

5050
if [ ! -z ${GG_BUILD_METAL} ]; then
5151
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -428,10 +428,10 @@ function gg_run_qwen3_0_6b {
428428

429429
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
430430

431-
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
432-
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
433-
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
434-
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
431+
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
432+
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
433+
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
434+
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
435435

436436
function check_ppl {
437437
qnt="$1"
@@ -523,8 +523,8 @@ function gg_run_embd_bge_small {
523523

524524
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
525525

526-
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
527-
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
526+
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
527+
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
528528

529529
set +e
530530
}
@@ -564,7 +564,7 @@ function gg_run_rerank_tiny {
564564
model_f16="${path_models}/ggml-model-f16.gguf"
565565

566566
# for this model, the SEP token is "</s>"
567-
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
567+
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
568568

569569
# sample output
570570
# rerank score 0: 0.029

common/arg.cpp

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,12 @@ static bool is_autoy(const std::string & value) {
694694
}
695695

696696
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
697+
// default values specific to example
698+
// note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
699+
if (ex == LLAMA_EXAMPLE_SERVER) {
700+
params.use_jinja = true;
701+
}
702+
697703
// load dynamic backends
698704
ggml_backend_load_all();
699705

@@ -974,7 +980,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
974980
[](common_params & params) {
975981
params.kv_unified = true;
976982
}
977-
).set_env("LLAMA_ARG_KV_SPLIT"));
983+
).set_env("LLAMA_ARG_KV_UNIFIED"));
978984
add_opt(common_arg(
979985
{"--no-context-shift"},
980986
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1232,6 +1238,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12321238
[](common_params & params, const std::string & value) {
12331239
const auto sampler_names = string_split<std::string>(value, ';');
12341240
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
1241+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
12351242
}
12361243
).set_sparam());
12371244
add_opt(common_arg(
@@ -1261,27 +1268,31 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12611268
[](common_params & params, const std::string & value) {
12621269
params.sampling.temp = std::stof(value);
12631270
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
1271+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
12641272
}
12651273
).set_sparam());
12661274
add_opt(common_arg(
12671275
{"--top-k"}, "N",
12681276
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
12691277
[](common_params & params, int value) {
12701278
params.sampling.top_k = value;
1279+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
12711280
}
12721281
).set_sparam());
12731282
add_opt(common_arg(
12741283
{"--top-p"}, "N",
12751284
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
12761285
[](common_params & params, const std::string & value) {
12771286
params.sampling.top_p = std::stof(value);
1287+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
12781288
}
12791289
).set_sparam());
12801290
add_opt(common_arg(
12811291
{"--min-p"}, "N",
12821292
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
12831293
[](common_params & params, const std::string & value) {
12841294
params.sampling.min_p = std::stof(value);
1295+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
12851296
}
12861297
).set_sparam());
12871298
add_opt(common_arg(
@@ -1296,13 +1307,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
12961307
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
12971308
[](common_params & params, const std::string & value) {
12981309
params.sampling.xtc_probability = std::stof(value);
1310+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
12991311
}
13001312
).set_sparam());
13011313
add_opt(common_arg(
13021314
{"--xtc-threshold"}, "N",
13031315
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
13041316
[](common_params & params, const std::string & value) {
13051317
params.sampling.xtc_threshold = std::stof(value);
1318+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
13061319
}
13071320
).set_sparam());
13081321
add_opt(common_arg(
@@ -1321,13 +1334,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
13211334
}
13221335
params.sampling.penalty_last_n = value;
13231336
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
1337+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
13241338
}
13251339
).set_sparam());
13261340
add_opt(common_arg(
13271341
{"--repeat-penalty"}, "N",
13281342
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
13291343
[](common_params & params, const std::string & value) {
13301344
params.sampling.penalty_repeat = std::stof(value);
1345+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
13311346
}
13321347
).set_sparam());
13331348
add_opt(common_arg(
@@ -1425,20 +1440,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
14251440
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
14261441
[](common_params & params, int value) {
14271442
params.sampling.mirostat = value;
1443+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
14281444
}
14291445
).set_sparam());
14301446
add_opt(common_arg(
14311447
{"--mirostat-lr"}, "N",
14321448
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
14331449
[](common_params & params, const std::string & value) {
14341450
params.sampling.mirostat_eta = std::stof(value);
1451+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
14351452
}
14361453
).set_sparam());
14371454
add_opt(common_arg(
14381455
{"--mirostat-ent"}, "N",
14391456
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
14401457
[](common_params & params, const std::string & value) {
14411458
params.sampling.mirostat_tau = std::stof(value);
1459+
params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
14421460
}
14431461
).set_sparam());
14441462
add_opt(common_arg(
@@ -2476,11 +2494,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24762494
).set_examples({LLAMA_EXAMPLE_SERVER}));
24772495
add_opt(common_arg(
24782496
{"--jinja"},
2479-
"use jinja template for chat (default: disabled)",
2497+
string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
24802498
[](common_params & params) {
24812499
params.use_jinja = true;
24822500
}
24832501
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
2502+
add_opt(common_arg(
2503+
{"--no-jinja"},
2504+
string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
2505+
[](common_params & params) {
2506+
params.use_jinja = false;
2507+
}
2508+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
24842509
add_opt(common_arg(
24852510
{"--reasoning-format"}, "FORMAT",
24862511
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2614,7 +2639,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
26142639
[](common_params &, const std::string & value) {
26152640
common_log_set_file(common_log_main(), value.c_str());
26162641
}
2617-
));
2642+
).set_env("LLAMA_LOG_FILE"));
26182643
add_opt(common_arg(
26192644
{"--log-colors"}, "[on|off|auto]",
26202645
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"

0 commit comments

Comments
 (0)