@@ -291,14 +291,16 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
291291 hf_tag = " default" ;
292292 }
293293
294- const bool offline = params.offline ;
295294 std::string model_endpoint = get_model_endpoint ();
296295 auto preset_url = model_endpoint + hf_repo + " /resolve/main/preset.ini" ;
297296
298297 // prepare local path for caching
299298 auto preset_fname = clean_file_name (hf_repo + " _preset.ini" );
300299 auto preset_path = fs_get_cache_file (preset_fname);
301- const int status = common_download_file_single (preset_url, preset_path, params.hf_token , offline);
300+ common_download_opts opts;
301+ opts.bearer_token = params.hf_token ;
302+ opts.offline = params.offline ;
303+ const int status = common_download_file_single (preset_url, preset_path, opts);
302304 const bool has_preset = status >= 200 && status < 400 ;
303305
304306 // remote preset is optional, so we don't error out if not found
@@ -341,10 +343,10 @@ static handle_model_result common_params_handle_model(struct common_params_model
341343 model.hf_file = model.path ;
342344 model.path = " " ;
343345 }
344- common_download_model_opts opts;
345- opts.download_mmproj = true ;
346+ common_download_opts opts;
347+ opts.bearer_token = bearer_token ;
346348 opts.offline = offline;
347- auto download_result = common_download_model (model, bearer_token, opts );
349+ auto download_result = common_download_model (model, opts, true );
348350
349351 if (download_result.model_path .empty ()) {
350352 LOG_ERR (" error: failed to download model from Hugging Face\n " );
@@ -365,9 +367,10 @@ static handle_model_result common_params_handle_model(struct common_params_model
365367 model.path = fs_get_cache_file (string_split<std::string>(f, ' /' ).back ());
366368 }
367369
368- common_download_model_opts opts;
370+ common_download_opts opts;
371+ opts.bearer_token = bearer_token;
369372 opts.offline = offline;
370- auto download_result = common_download_model (model, bearer_token, opts);
373+ auto download_result = common_download_model (model, opts);
371374 if (download_result.model_path .empty ()) {
372375 LOG_ERR (" error: failed to download model from %s\n " , model.url .c_str ());
373376 exit (1 );
@@ -2348,19 +2351,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
23482351 }
23492352 ).set_env (" LLAMA_ARG_N_GPU_LAYERS" ));
23502353 add_opt (common_arg (
2351- {" -sm" , " --split-mode" }, " {none,layer,row}" ,
2354+ {" -sm" , " --split-mode" }, " {none,layer,row,tensor }" ,
23522355 " how to split the model across multiple GPUs, one of:\n "
23532356 " - none: use one GPU only\n "
2354- " - layer (default): split layers and KV across GPUs\n "
2355- " - row: split rows across GPUs" ,
2357+ " - layer (default): split layers and KV across GPUs (pipelined)\n "
2358+ " - row: split weight across GPUs by rows (parallelized)\n "
2359+ " - tensor: split weights and KV across GPUs (parallelized, EXPERIMENTAL)" ,
23562360 [](common_params & params, const std::string & value) {
2357- std::string arg_next = value;
2358- if (arg_next == " none" ) {
2361+ if (value == " none" ) {
23592362 params.split_mode = LLAMA_SPLIT_MODE_NONE ;
2360- } else if (arg_next == " layer" ) {
2363+ } else if (value == " layer" ) {
23612364 params.split_mode = LLAMA_SPLIT_MODE_LAYER ;
2362- } else if (arg_next == " row" ) {
2365+ } else if (value == " row" ) {
23632366 params.split_mode = LLAMA_SPLIT_MODE_ROW ;
2367+ } else if (value == " tensor" ) {
2368+ params.split_mode = LLAMA_SPLIT_MODE_TENSOR ;
23642369 } else {
23652370 throw std::invalid_argument (" invalid value" );
23662371 }
0 commit comments