diff --git a/exploration/create_dataset.py b/exploration/create_dataset.py index faaed22..4166a1f 100644 --- a/exploration/create_dataset.py +++ b/exploration/create_dataset.py @@ -54,6 +54,7 @@ def parse_args(): p.add_argument("--batch", type=int, default=4) p.add_argument("--validate", type=int, default=20, help="Print class sequences for first N problems") p.add_argument("--max_new_tokens", type=int, default=1024) + p.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility (e.g. generation with temperature)") return p.parse_args() # ── HELPERS ───────────────────────────────────────────────── @@ -208,6 +209,9 @@ def main(): args = parse_args() os.makedirs(args.out, exist_ok=True) + torch.manual_seed(args.seed) + np.random.seed(args.seed) + ckpt_raw = os.path.join(args.out, "raw_extractions.pkl") ckpt_features = os.path.join(args.out, "all_sentences_features.pkl") ckpt_with_neu = os.path.join(args.out, "all_sentences_features_with_neutral.pkl") diff --git a/exploration/create_dataset_multigpu.py b/exploration/create_dataset_multigpu.py new file mode 100644 index 0000000..96841c8 --- /dev/null +++ b/exploration/create_dataset_multigpu.py @@ -0,0 +1,203 @@ +""" +Multi-GPU launcher for create_dataset: shards problems across N GPUs (e.g. 8x A100), +runs Phase 1 + Phase 2 per shard, then merges and writes final outputs. 
+ +Usage: + python create_dataset_multigpu.py \ + --model deepseek-ai/DeepSeek-R1-Distill-Qwen-14B \ + --base Qwen/Qwen2.5-14B \ + --dataset openai/gsm8k \ + --split train \ + --n 7473 \ + --layer 28 \ + --out ./gsm8k_output \ + --ngpus 8 +""" + +from __future__ import annotations + +import os +import sys +import pickle +import argparse +import subprocess +import tempfile + +# Import from create_dataset for merge/flatten logic and validation +from create_dataset import ( + CLASSES_ORDERED, + parse_args as base_parse_args, + print_validation, +) + + +def parse_args(): + p = argparse.ArgumentParser( + description="Run create_dataset across multiple GPUs (shard by problem index)." + ) + p.add_argument("--model", default="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B") + p.add_argument("--base", default="Qwen/Qwen2.5-14B") + p.add_argument("--clf", default="Qwen/Qwen2.5-7B-Instruct") + p.add_argument("--dataset", default="HuggingFaceH4/MATH-500") + p.add_argument("--split", default="test") + p.add_argument("--layer", type=int, default=28) + p.add_argument("--n", type=int, default=500) + p.add_argument("--out", default="./dataset_output") + p.add_argument("--batch", type=int, default=4) + p.add_argument("--validate", type=int, default=20) + p.add_argument("--max_new_tokens", type=int, default=1024) + p.add_argument( + "--ngpus", + type=int, + default=8, + help="Number of GPUs to use (default 8 for 8x A100)", + ) + p.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility (passed to workers)") + return p.parse_args() + + +def main(): + args = parse_args() + args.out = os.path.abspath(args.out) + os.makedirs(args.out, exist_ok=True) + + # Load dataset once to get size and validate + from datasets import load_dataset + + config = "default" if "MATH" in args.dataset else "main" + ds = load_dataset(args.dataset, config, split=f"{args.split}[:{args.n}]") + key = "problem" if "problem" in ds[0] else "question" + n_problems = len(ds) + print(f"Dataset: 
{args.dataset} {args.split}, {n_problems} problems") + print(f"Sharding across {args.ngpus} GPUs\n") + + ngpus = min(args.ngpus, n_problems) + if ngpus < args.ngpus: + print(f"Using {ngpus} GPUs (n_problems={n_problems} < ngpus={args.ngpus})") + + # Shard boundaries: [0, s1), [s1, s2), ..., [s_{n-1}, n_problems) + shard_size = (n_problems + ngpus - 1) // ngpus + ranges = [] + for i in range(ngpus): + start = i * shard_size + end = min(start + shard_size, n_problems) + if start < n_problems: + ranges.append((i, start, end)) + + # Args for workers (out must be absolute since workers run with cwd=script_dir) + args_dict = { + "out": os.path.abspath(args.out), + "dataset": args.dataset, + "split": args.split, + "n": args.n, + "base": args.base, + "model": args.model, + "clf": args.clf, + "layer": args.layer, + "batch": args.batch, + "seed": args.seed, + } + + script_dir = os.path.dirname(os.path.abspath(__file__)) + worker_script = os.path.join(script_dir, "create_dataset_worker.py") + + with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as f: + args_pickle = f.name + try: + pickle.dump(args_dict, open(args_pickle, "wb")) + + procs = [] + for rank, start_idx, end_idx in ranges: + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = str(rank) + cmd = [ + sys.executable, + worker_script, + str(rank), + str(start_idx), + str(end_idx), + args_pickle, + ] + p = subprocess.Popen(cmd, env=env, cwd=script_dir) + procs.append((rank, p)) + + for rank, p in procs: + p.wait() + if p.returncode != 0: + raise RuntimeError(f"Worker rank {rank} exited with code {p.returncode}") + + print("\nAll workers finished. 
Merging shards...") + finally: + if os.path.exists(args_pickle): + os.unlink(args_pickle) + + # Merge raw_extractions from all shards + all_extractions = {} + for rank, start_idx, end_idx in ranges: + shard_raw = os.path.join(args.out, f"shard_{rank}", "raw_extractions.pkl") + if not os.path.exists(shard_raw): + continue + shard_data = pickle.load(open(shard_raw, "rb")) + for pid, data in shard_data.items(): + all_extractions[pid] = data + + # Flatten to final format (same as create_dataset) + ckpt_features = os.path.join(args.out, "all_sentences_features.pkl") + ckpt_with_neu = os.path.join(args.out, "all_sentences_features_with_neutral.pkl") + ckpt_cot = os.path.join(args.out, "cot_data.pkl") + ckpt_raw_merged = os.path.join(args.out, "raw_extractions.pkl") + + pickle.dump(all_extractions, open(ckpt_raw_merged, "wb")) + + all_features = [] + all_features_with_neutral = [] + for pid, data in all_extractions.items(): + for feat in data["sentence_features"]: + if "stage" not in feat: + continue + entry = { + "hidden_state": feat["hidden_state"], + "hidden_state_last": feat["hidden_state_last"], + "problem_id": pid, + "sentence_idx": feat["sentence_idx"], + "sentence": feat["sentence"], + "stage": feat["stage"], + "is_anchor": feat.get("is_anchor", feat["stage"] != "NEUTRAL"), + } + all_features_with_neutral.append(entry) + if feat["stage"] != "NEUTRAL": + all_features.append(entry) + + pickle.dump(all_features, open(ckpt_features, "wb")) + pickle.dump(all_features_with_neutral, open(ckpt_with_neu, "wb")) + pickle.dump( + { + pid: { + "problem": d["problem"], + "cot": d["cot"], + "sentences": d["sentences"], + } + for pid, d in all_extractions.items() + }, + open(ckpt_cot, "wb"), + ) + + print_validation(all_extractions, n=args.validate) + + stage_counts = {} + for f in all_features_with_neutral: + stage_counts[f["stage"]] = stage_counts.get(f["stage"], 0) + 1 + + print(f"\n{'='*60}") + print("DONE (multi-GPU merge)") + print(f" Non-neutral features : 
{len(all_features)}") + print(f" All features : {len(all_features_with_neutral)}") + print(f" Output dir : {args.out}") + print(f"\nStage distribution:") + for cls in CLASSES_ORDERED: + print(f" {cls:30s}: {stage_counts.get(cls, 0)}") + print("="*60) + + +if __name__ == "__main__": + main() diff --git a/exploration/create_dataset_worker.py b/exploration/create_dataset_worker.py new file mode 100644 index 0000000..3829f75 --- /dev/null +++ b/exploration/create_dataset_worker.py @@ -0,0 +1,154 @@ +""" +Worker for create_dataset_multigpu: runs Phase 1 + Phase 2 on a single GPU +for a slice of problem indices. Invoked by the launcher with +CUDA_VISIBLE_DEVICES set so this process sees only one GPU. + +Usage (called by create_dataset_multigpu.py, not directly): + CUDA_VISIBLE_DEVICES= python create_dataset_worker.py +""" + +from __future__ import annotations + +import os +import sys +import gc +import pickle +import argparse + +# Set device before any torch/cuda import (launcher sets CUDA_VISIBLE_DEVICES when spawning) +import torch +import numpy as np +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM + +from create_dataset import ( + CLASSES_ORDERED, + split_into_sentences, + get_sentence_token_ranges, + process_problem, + get_classification_prompt, + classify_sentences, +) + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +dtype = torch.bfloat16 + + +def parse_worker_args(): + p = argparse.ArgumentParser() + p.add_argument("rank", type=int) + p.add_argument("start_idx", type=int) + p.add_argument("end_idx", type=int) + p.add_argument("args_pickle", type=str) + return p.parse_args() + + +def run_worker(rank: int, start_idx: int, end_idx: int, args_dict: dict): + out = args_dict["out"] + shard_dir = os.path.join(out, f"shard_{rank}") + os.makedirs(shard_dir, exist_ok=True) + ckpt_raw = os.path.join(shard_dir, "raw_extractions.pkl") + + from datasets import load_dataset + + dataset_name = args_dict["dataset"] + split = 
args_dict["split"] + n = args_dict["n"] + config = "default" if "MATH" in dataset_name else "main" + ds = load_dataset(dataset_name, config, split=f"{split}[:{n}]") + key = "problem" if "problem" in ds[0] else "question" + problems = [ds[i][key] for i in range(len(ds))] + + my_pids = list(range(start_idx, min(end_idx, len(problems)))) + if not my_pids: + print(f"[Rank {rank}] No problems in range [{start_idx}, {end_idx})") + return + + seed = args_dict.get("seed", 42) + + if os.path.exists(ckpt_raw): + all_extractions = pickle.load(open(ckpt_raw, "rb")) + done_pids = set(all_extractions.keys()) & set(my_pids) + else: + all_extractions = {} + done_pids = set() + + remaining = [(pid, problems[pid]) for pid in my_pids if pid not in done_pids] + print(f"[Rank {rank}] Problems {start_idx}-{end_idx}: {len(remaining)} remaining, {len(done_pids)} done") + + # Phase 1: generation + extraction + if remaining: + tokenizer = AutoTokenizer.from_pretrained(args_dict["base"], trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForCausalLM.from_pretrained( + args_dict["model"], + torch_dtype=dtype, + low_cpu_mem_usage=True, + ).to(DEVICE) + model.eval() + + for pid, problem in tqdm(remaining, desc=f"Rank {rank} Phase1"): + torch.manual_seed(seed + pid) + np.random.seed(seed + pid) + result = process_problem(problem, model, tokenizer, args_dict["layer"]) + if result: + all_extractions[pid] = result + if (pid + 1) % 25 == 0: + pickle.dump(all_extractions, open(ckpt_raw, "wb")) + torch.cuda.empty_cache() + gc.collect() + + pickle.dump(all_extractions, open(ckpt_raw, "wb")) + del model + torch.cuda.empty_cache() + gc.collect() + + # Phase 2: classification + unclassified = [] + unclassified_refs = [] + for pid, data in all_extractions.items(): + for i, feat in enumerate(data["sentence_features"]): + if "stage" not in feat: + unclassified.append(feat["sentence"]) + unclassified_refs.append((pid, i)) + + if 
unclassified: + clf_tokenizer = AutoTokenizer.from_pretrained(args_dict["clf"], trust_remote_code=True) + clf_tokenizer.pad_token = clf_tokenizer.eos_token + clf_tokenizer.padding_side = "left" + classifier = AutoModelForCausalLM.from_pretrained( + args_dict["clf"], + torch_dtype=dtype, + low_cpu_mem_usage=True, + ).to(DEVICE) + classifier.eval() + + classifications = classify_sentences( + unclassified, + clf_tokenizer, + classifier, + batch_size=args_dict.get("batch", 4), + ) + + for (pid, feat_idx), cls in zip(unclassified_refs, classifications): + all_extractions[pid]["sentence_features"][feat_idx]["stage"] = cls + all_extractions[pid]["sentence_features"][feat_idx]["is_anchor"] = cls != "NEUTRAL" + + pickle.dump(all_extractions, open(ckpt_raw, "wb")) + del classifier, clf_tokenizer + torch.cuda.empty_cache() + gc.collect() + + print(f"[Rank {rank}] Done. Extracted {len(all_extractions)} problems.") + + +def main(): + args = parse_worker_args() + with open(args.args_pickle, "rb") as f: + args_dict = pickle.load(f) + run_worker(args.rank, args.start_idx, args.end_idx, args_dict) + + +if __name__ == "__main__": + main() diff --git a/huggingface_scripts/hf_rename_folder.py b/huggingface_scripts/hf_rename_folder.py new file mode 100644 index 0000000..dd84824 --- /dev/null +++ b/huggingface_scripts/hf_rename_folder.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +Rename a folder (path) in a Hugging Face Hub repository. + +Moves all files from old_path to new_path in the same repo: +- Lists files under old_path +- Downloads each, uploads to new_path, deletes the original + +Requires: pip install huggingface_hub + +Authentication: + - Run `huggingface-cli login` or set HF_TOKEN in the environment. + - Your token must have write access to the repo. 

import argparse
import tempfile
from pathlib import Path

# Helpers are exported so they can be unit-tested without huggingface_hub installed.
__all__ = ["rename_folder", "main", "_norm_path", "_dest_path"]


def _norm_path(p: str) -> str:
    """Normalize path: no leading slash, forward slashes, trailing slash for a
    non-empty directory ("" stays "")."""
    p = p.strip("/").replace("\\", "/")
    return p + "/" if p else ""


def _dest_path(path_in_repo: str, old_prefix: str, new_prefix: str) -> str:
    """Compute the target path for one file being moved.

    Keeps the path relative to old_prefix when it applies; otherwise (a file
    that matched old_path exactly) falls back to the bare filename. Shared by
    the dry-run preview and the real move so the two can never disagree.
    """
    if path_in_repo.startswith(old_prefix):
        rel = path_in_repo[len(old_prefix):]
    else:
        rel = Path(path_in_repo).name
    return (new_prefix + rel).rstrip("/")


def rename_folder(
    repo_id: str,
    old_path: str,
    new_path: str,
    repo_type: str = "model",
    revision: str = "main",
    token: str = None,
    dry_run: bool = False,
):
    """
    Move all files from old_path to new_path inside the same repo.

    Args:
        repo_id: "username/repo-name".
        old_path: Current folder path in the repo (e.g. "data/raw" or "data/raw/").
        new_path: Target folder path (e.g. "data/processed").
        repo_type: "model", "dataset", or "space".
        revision: Branch to work on (default: main).
        token: HF token (default: HF_TOKEN env or huggingface-cli login).
        dry_run: If True, only print what would be moved; do not upload or delete.
    """
    # Imported here so the pure path helpers above stay importable without
    # huggingface_hub installed.
    from huggingface_hub import HfApi, hf_hub_download

    api = HfApi(token=token)
    old_prefix = _norm_path(old_path)
    new_prefix = _norm_path(new_path)

    if old_prefix == new_prefix:
        print("Old and new path are the same. Nothing to do.")
        return

    all_files = api.list_repo_files(
        repo_id=repo_id,
        repo_type=repo_type,
        revision=revision,
        token=token,
    )
    to_move = [f for f in all_files if f == old_prefix.rstrip("/") or f.startswith(old_prefix)]

    if not to_move:
        print(f"No files found under '{old_path}'. Check the path and repo_type.")
        return

    print(f"Found {len(to_move)} file(s) under '{old_path}' to move to '{new_path}'.")
    if dry_run:
        # BUGFIX: the preview now uses the exact same target computation as the
        # real move below. Previously a file matching old_path exactly got
        # rel="" and printed a trailing-slash target that differed from what
        # the actual move would have done.
        for f in to_move:
            print(f"  {f} -> {_dest_path(f, old_prefix, new_prefix)}")
        return

    with tempfile.TemporaryDirectory(prefix="hf_rename_") as tmpdir:
        for path_in_repo in to_move:
            if path_in_repo.endswith("/"):
                continue  # directory placeholder, nothing to download
            new_file_path = _dest_path(path_in_repo, old_prefix, new_prefix)

            print(f"  {path_in_repo} -> {new_file_path}")
            local_file = hf_hub_download(
                repo_id=repo_id,
                filename=path_in_repo,
                repo_type=repo_type,
                revision=revision,
                token=token,
                local_dir=tmpdir,
                local_dir_use_symlinks=False,
                force_download=True,
            )
            api.upload_file(
                path_or_fileobj=local_file,
                path_in_repo=new_file_path,
                repo_id=repo_id,
                repo_type=repo_type,
                revision=revision,
                token=token,
                commit_message=f"Move {path_in_repo} to {new_file_path}",
            )
            api.delete_file(
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
                revision=revision,
                token=token,
                commit_message=f"Remove old path {path_in_repo}",
            )

    print("Done.")


def main():
    """CLI wrapper around rename_folder."""
    parser = argparse.ArgumentParser(
        description="Rename (move) a folder in a Hugging Face repo.",
    )
    parser.add_argument(
        "repo_id",
        help='Repo id, e.g. "username/repo-name"',
    )
    parser.add_argument(
        "old_path",
        help='Current folder path in the repo (e.g. data/raw)',
    )
    parser.add_argument(
        "new_path",
        help='Target folder path (e.g. data/processed)',
    )
    parser.add_argument(
        "--repo-type",
        choices=("model", "dataset", "space"),
        default="model",
        help="Repo type (default: model)",
    )
    parser.add_argument(
        "--revision",
        default="main",
        help="Branch to work on (default: main)",
    )
    parser.add_argument(
        "--token",
        default=None,
        help="Hugging Face token (default: HF_TOKEN env or huggingface-cli login)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Only list what would be moved; do not upload or delete",
    )
    args = parser.parse_args()

    rename_folder(
        repo_id=args.repo_id,
        old_path=args.old_path,
        new_path=args.new_path,
        repo_type=args.repo_type,
        revision=args.revision,
        token=args.token,
        dry_run=args.dry_run,
    )


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Transfer files from one Hugging Face Hub repository to another.

Requires: pip install huggingface_hub

Authentication:
  - Run `huggingface-cli login` or set HF_TOKEN in the environment.
  - Your token must have read access to the source repo and write access to the destination.
"""

import argparse
import tempfile
import shutil
from pathlib import Path

from huggingface_hub import (
    HfApi,
    snapshot_download,
)


def transfer_repo(
    source_repo: str,
    dest_repo: str,
    repo_type: str = "model",
    source_revision: str = None,
    dest_revision: str = "main",
    path_in_repo: str = None,
    create_dest: bool = True,
    token: str = None,
    allow_patterns: list = None,
    ignore_patterns: list = None,
):
    """Copy the contents of source_repo into dest_repo.

    The source snapshot is downloaded into a temporary directory, then pushed
    to the destination in one upload_folder commit.

    Args:
        source_repo: "username/repo-name" of the source repo.
        dest_repo: "username/repo-name" of the destination repo.
        repo_type: "model", "dataset", or "space".
        source_revision: Branch/tag/commit of source (default: main).
        dest_revision: Branch to push to (default: main).
        path_in_repo: Subfolder in dest repo to upload into (e.g. "models/v1"). None = repo root.
        create_dest: If True, create destination repo if it doesn't exist.
        token: HF token (default: from HF_TOKEN env or huggingface-cli login).
        allow_patterns: Only transfer files matching these globs (e.g. ["*.safetensors"]).
        ignore_patterns: Skip files matching these globs.
    """
    hub = HfApi(token=token)
    revision = source_revision if source_revision else "main"

    with tempfile.TemporaryDirectory(prefix="hf_transfer_") as workdir:
        staging = Path(workdir) / "repo"
        print(f"Downloading {source_repo} ({repo_type}) -> {staging}")
        snapshot_download(
            repo_id=source_repo,
            repo_type=repo_type,
            revision=revision,
            local_dir=str(staging),
            local_dir_use_symlinks=False,
            token=token,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
        )

        if create_dest:
            # Best-effort: an existing repo (or insufficient rights) should not
            # abort the transfer; the upload below will surface hard failures.
            try:
                hub.create_repo(
                    repo_id=dest_repo,
                    repo_type=repo_type,
                    exist_ok=True,
                )
            except Exception as e:
                print(f"Note: create_repo: {e}")
            else:
                print(f"Destination repo {dest_repo} ready (created or already exists).")

        suffix = f" -> {path_in_repo}/" if path_in_repo else ""
        print(f"Uploading {staging} -> {dest_repo}{suffix} ({repo_type})")
        hub.upload_folder(
            folder_path=str(staging),
            repo_id=dest_repo,
            repo_type=repo_type,
            revision=dest_revision,
            path_in_repo=path_in_repo,
            token=token,
            commit_message=f"Transfer from {source_repo}",
        )

    print("Done.")


def main():
    """CLI wrapper around transfer_repo."""
    ap = argparse.ArgumentParser(
        description="Transfer files from one Hugging Face repo to another."
    )
    ap.add_argument(
        "source_repo",
        help='Source repo id, e.g. "username/source-repo"',
    )
    ap.add_argument(
        "dest_repo",
        help='Destination repo id, e.g. "other-user/dest-repo"',
    )
    ap.add_argument(
        "--repo-type",
        choices=("model", "dataset", "space"),
        default="model",
        help="Repo type (default: model)",
    )
    ap.add_argument(
        "--source-revision",
        default=None,
        help="Source branch/tag/commit (default: main)",
    )
    ap.add_argument(
        "--dest-revision",
        default="main",
        help="Destination branch to push to (default: main)",
    )
    ap.add_argument(
        "--path-in-repo",
        default=None,
        metavar="PATH",
        help="Subfolder in destination repo to upload into (e.g. models/v1). Default: repo root",
    )
    ap.add_argument(
        "--no-create",
        action="store_true",
        help="Do not create destination repo if it does not exist",
    )
    ap.add_argument(
        "--token",
        default=None,
        help="Hugging Face token (default: HF_TOKEN env or huggingface-cli login)",
    )
    ap.add_argument(
        "--allow-patterns",
        nargs="+",
        default=None,
        help="Only transfer files matching these globs, e.g. --allow-patterns '*.safetensors' '*.json'",
    )
    ap.add_argument(
        "--ignore-patterns",
        nargs="+",
        default=None,
        help="Skip files matching these globs",
    )
    opts = ap.parse_args()

    transfer_repo(
        source_repo=opts.source_repo,
        dest_repo=opts.dest_repo,
        repo_type=opts.repo_type,
        source_revision=opts.source_revision,
        dest_revision=opts.dest_revision,
        path_in_repo=opts.path_in_repo,
        create_dest=not opts.no_create,
        token=opts.token,
        allow_patterns=opts.allow_patterns,
        ignore_patterns=opts.ignore_patterns,
    )


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Upload create_dataset output (e.g. gsm8k_output) to a Hugging Face dataset repo.

Usage:
    python upload_to_hf.py
    python upload_to_hf.py --folder ./gsm8k_output --repo [yourname]/[yourrepo] --path-in-repo layer_20

Requires: pip install huggingface_hub, then login via:
    python -c "from huggingface_hub import login; login()"
or set HF_TOKEN in the environment.
"""

import argparse
import os

# The four merged artifacts produced by create_dataset / the multi-GPU merge.
MERGED_FILES = (
    "raw_extractions.pkl",
    "all_sentences_features.pkl",
    "all_sentences_features_with_neutral.pkl",
    "cot_data.pkl",
)


def repo_path(prefix: str, name: str) -> str:
    """Join a repo-relative destination path: forward slashes, no './' prefix.

    BUGFIX: the previous os.path.join(prefix, name) with the default
    prefix "." produced paths like "./file.pkl" inside the repo.
    """
    joined = os.path.join(prefix, name)
    return os.path.normpath(joined).replace("\\", "/")


def main(argv=None):
    """Upload the output folder (or just the merged files) to a dataset repo.

    Args:
        argv: Optional CLI argument list (defaults to sys.argv[1:]); added
            backward-compatibly so the entry point is testable.

    Raises:
        SystemExit: if --folder does not exist.
    """
    p = argparse.ArgumentParser(description="Upload dataset output to Hugging Face")
    p.add_argument(
        "--folder",
        default="./gsm8k_output",
        help="Local folder containing raw_extractions.pkl, all_sentences_features.pkl, etc.",
    )
    p.add_argument(
        "--repo",
        default="withmartian/gsm8k_qwen14b_SDS_traindata",
        help="Hugging Face dataset repo id (e.g. username/repo_name)",
    )
    p.add_argument(
        "--merge-only",
        action="store_true",
        help="Upload only the 4 merged files (no shard_* dirs)",
    )
    p.add_argument(
        "--path-in-repo",
        default=".",
        help="Path inside the repo where to upload (e.g. 'layer_27' to put files in repo/layer_27/)",
    )
    args = p.parse_args(argv)

    folder = os.path.abspath(args.folder)
    if not os.path.isdir(folder):
        raise SystemExit(f"Folder not found: {folder}")

    # Imported lazily so argument errors surface without huggingface_hub installed.
    from huggingface_hub import HfApi

    token = os.environ.get("HF_TOKEN")
    api = HfApi(token=token)

    if args.merge_only:
        for f in MERGED_FILES:
            path = os.path.join(folder, f)
            if not os.path.isfile(path):
                print(f"Skip (missing): {path}")
                continue
            print(f"Uploading {f} ...")
            api.upload_file(
                path_or_fileobj=path,
                path_in_repo=repo_path(args.path_in_repo, f),
                repo_id=args.repo,
                repo_type="dataset",
                token=token,
            )
        print("Done (merge-only).")
    else:
        print(f"Uploading folder {folder} to {args.repo} ({args.path_in_repo}) ...")
        dest = os.path.normpath(args.path_in_repo).replace("\\", "/")
        api.upload_folder(
            folder_path=folder,
            repo_id=args.repo,
            repo_type="dataset",
            # "." means repo root; upload_folder's canonical spelling is None.
            path_in_repo=None if dest == "." else dest,
            token=token,
        )
        print("Done.")


if __name__ == "__main__":
    main()