diff --git a/scripts/configs/grid_search_scaling_sasrec_bce_amazons.sh b/scripts/configs/grid_search_scaling_sasrec_bce_amazons.sh
new file mode 100644
index 0000000..cab2cee
--- /dev/null
+++ b/scripts/configs/grid_search_scaling_sasrec_bce_amazons.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# # Configure visible GPUs for the run (comma-separated string of physical GPU ids).
+# export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-4,5,6,7}"
+
+# Define GPU groups used by grid_search. Each entry is a comma-separated list of GPU ids.
+# Example: GPU_GROUPS=("4,5" "6,7") to allocate two GPUs per trial.
+GPU_GROUPS=("0,1,2,3,4,5,6,7")
+
+if [[ ${#GPU_GROUPS[@]} -eq 0 ]]; then
+    echo "GPU_GROUPS must contain at least one entry" >&2
+    exit 1
+fi
+
+# We assume configs are provided in ./configs/
+BASE_DIR=$(dirname "$(realpath "$0")")
+CONFIG_DIR="${BASE_DIR}/configs"
+
+# Set up template config path.
+TEMPLATE_PATH="${CONFIG_DIR}/seqrec/template.yaml"
+
+# Set up search config path options.
+SCALING_SCALES=("s" "b" "l" "xl" "xxl")
+SEARCH_PREFIX="${CONFIG_DIR}/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon"
+
+# Set up output directory.
+OUTPUT_ROOT="${BASE_DIR}/../outputs/seqrec/scaling/sasrec_bce_amazons"
+
+# Set up main module to run.
+MAIN_MODULE="genrec.main_seqrec"
+
+# Optional dryrun/rerun controls (set to desired exp_id or leave empty).
+DRYRUN_EXP_ID="${DRYRUN_EXP_ID:-}"
+RERUN_EXP_ID="${RERUN_EXP_ID:-}"
+
+if [[ -n "$DRYRUN_EXP_ID" && -n "$RERUN_EXP_ID" ]]; then
+    echo "Set either DRYRUN_EXP_ID or RERUN_EXP_ID, not both." >&2
+    exit 1
+fi
+
+# If dryrun or rerun is specified, adjust SEARCH_PATH accordingly.
+if [[ -n "$DRYRUN_EXP_ID" ]]; then
+    SEARCH_PATH="${OUTPUT_ROOT}/${DRYRUN_EXP_ID}/search.yaml"
+elif [[ -n "$RERUN_EXP_ID" ]]; then
+    SEARCH_PATH="${OUTPUT_ROOT}/${RERUN_EXP_ID}/search.yaml"
+fi
+
+EXTRA_ARGS=()
+for grp in "${GPU_GROUPS[@]}"; do
+    EXTRA_ARGS+=(--gpu_groups "$grp")
+done
+
+if [[ -n "$DRYRUN_EXP_ID" ]]; then
+    EXTRA_ARGS+=(--dryrun "$DRYRUN_EXP_ID")
+elif [[ -n "$RERUN_EXP_ID" ]]; then
+    EXTRA_ARGS+=(--rerun "$RERUN_EXP_ID")
+fi
+
+run_grid_search() {
+    local search_path="$1"
+    # Executes one grid search invocation for the provided search config.
+    poetry run python scripts/grid_search.py \
+        --template "${TEMPLATE_PATH}" \
+        --search "${search_path}" \
+        --main "${MAIN_MODULE}" \
+        --output_root "${OUTPUT_ROOT}" \
+        "${EXTRA_ARGS[@]}"
+}
+
+if [[ -n "$DRYRUN_EXP_ID" || -n "$RERUN_EXP_ID" ]]; then
+    run_grid_search "${SEARCH_PATH}"
+else
+    for scale in "${SCALING_SCALES[@]}"; do
+        run_grid_search "${SEARCH_PREFIX}_${scale}.yaml"
+    done
+fi
diff --git a/scripts/configs/grid_search_scaling_sasrec_bce_movielens.sh b/scripts/configs/grid_search_scaling_sasrec_bce_movielens.sh
new file mode 100644
index 0000000..73098cb
--- /dev/null
+++ b/scripts/configs/grid_search_scaling_sasrec_bce_movielens.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# # Configure visible GPUs for the run (comma-separated string of physical GPU ids).
+# export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-4,5,6,7}"
+
+# Define GPU groups used by grid_search. Each entry is a comma-separated list of GPU ids.
+# Example: GPU_GROUPS=("4,5" "6,7") to allocate two GPUs per trial.
+GPU_GROUPS=("0,1,2,3,4,5,6,7")
+
+if [[ ${#GPU_GROUPS[@]} -eq 0 ]]; then
+    echo "GPU_GROUPS must contain at least one entry" >&2
+    exit 1
+fi
+
+# We assume configs are provided in ./configs/
+BASE_DIR=$(dirname "$(realpath "$0")")
+CONFIG_DIR="${BASE_DIR}/configs"
+
+# Set up template config path.
+TEMPLATE_PATH="${CONFIG_DIR}/seqrec/template.yaml"
+
+# Set up search config path options.
+# SCALING_SCALES=("xxs" "xs" "s" "b" "l" "xl" "xxl")
+SCALING_SCALES=("s" "b")
+SEARCH_PREFIX="${CONFIG_DIR}/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens"
+
+# Set up output directory.
+OUTPUT_ROOT="${BASE_DIR}/../outputs/seqrec/scaling/sasrec_bce_movielens"
+
+# Set up main module to run.
+MAIN_MODULE="genrec.main_seqrec"
+
+# Optional dryrun/rerun controls (set to desired exp_id or leave empty).
+DRYRUN_EXP_ID="${DRYRUN_EXP_ID:-}"
+RERUN_EXP_ID="${RERUN_EXP_ID:-}"
+
+if [[ -n "$DRYRUN_EXP_ID" && -n "$RERUN_EXP_ID" ]]; then
+    echo "Set either DRYRUN_EXP_ID or RERUN_EXP_ID, not both." >&2
+    exit 1
+fi
+
+# If dryrun or rerun is specified, adjust SEARCH_PATH accordingly.
+if [[ -n "$DRYRUN_EXP_ID" ]]; then
+    SEARCH_PATH="${OUTPUT_ROOT}/${DRYRUN_EXP_ID}/search.yaml"
+elif [[ -n "$RERUN_EXP_ID" ]]; then
+    SEARCH_PATH="${OUTPUT_ROOT}/${RERUN_EXP_ID}/search.yaml"
+fi
+
+EXTRA_ARGS=()
+for grp in "${GPU_GROUPS[@]}"; do
+    EXTRA_ARGS+=(--gpu_groups "$grp")
+done
+
+if [[ -n "$DRYRUN_EXP_ID" ]]; then
+    EXTRA_ARGS+=(--dryrun "$DRYRUN_EXP_ID")
+elif [[ -n "$RERUN_EXP_ID" ]]; then
+    EXTRA_ARGS+=(--rerun "$RERUN_EXP_ID")
+fi
+
+run_grid_search() {
+    local search_path="$1"
+    # Executes one grid search invocation for the provided search config.
+    poetry run python scripts/grid_search.py \
+        --template "${TEMPLATE_PATH}" \
+        --search "${search_path}" \
+        --main "${MAIN_MODULE}" \
+        --output_root "${OUTPUT_ROOT}" \
+        "${EXTRA_ARGS[@]}"
+}
+
+if [[ -n "$DRYRUN_EXP_ID" || -n "$RERUN_EXP_ID" ]]; then
+    run_grid_search "${SEARCH_PATH}"
+else
+    for scale in "${SCALING_SCALES[@]}"; do
+        run_grid_search "${SEARCH_PREFIX}_${scale}.yaml"
+    done
+fi
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_b.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_b.yaml
new file mode 100644
index 0000000..4fdcdeb
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_b.yaml
@@ -0,0 +1,77 @@
+# SASRec (Base) + BCE on Amazon-2018
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Amazon-2018 dataset.
+    search__interaction_data_path:
+    - ./data/amazon2018/proc/user2item.pkl
+    
+    max_seq_length: 100
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 16
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [512]
+        search__num_attention_heads: [8]
+        search__num_hidden_layers: [6]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 256
+        per_device_eval_batch_size: 512
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-3]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.1, 0.2]}]
+        - ["unpopularity", {p: [0.2, 0.4]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_l.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_l.yaml
new file mode 100644
index 0000000..e32e018
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_l.yaml
@@ -0,0 +1,77 @@
+# SASRec (Large) + BCE on Amazon-2018
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Amazon-2018 dataset.
+    search__interaction_data_path:
+    - ./data/amazon2018/proc/user2item.pkl
+    
+    max_seq_length: 100
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 16
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [768]
+        search__num_attention_heads: [12]
+        search__num_hidden_layers: [8]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 256
+        per_device_eval_batch_size: 512
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [5.0e-4]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.1, 0.2]}]
+        - ["unpopularity", {p: [0.2, 0.4]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_s.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_s.yaml
new file mode 100644
index 0000000..a92bfe3
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_s.yaml
@@ -0,0 +1,77 @@
+# SASRec (Small) + BCE on Amazon-2018
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Amazon-2018 dataset.
+    search__interaction_data_path:
+    - ./data/amazon2018/proc/user2item.pkl
+    
+    max_seq_length: 100
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 16
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [256]
+        search__num_attention_heads: [4]
+        search__num_hidden_layers: [4]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 256
+        per_device_eval_batch_size: 512
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-3]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.1, 0.2]}]
+        - ["unpopularity", {p: [0.2, 0.4]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xl.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xl.yaml
new file mode 100644
index 0000000..2bccc65
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xl.yaml
@@ -0,0 +1,77 @@
+# SASRec (XL) + BCE on Amazon-2018
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Amazon-2018 dataset.
+    search__interaction_data_path:
+    - ./data/amazon2018/proc/user2item.pkl
+    
+    max_seq_length: 100
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 16
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [1024]
+        search__num_attention_heads: [16]
+        search__num_hidden_layers: [10]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 256
+        per_device_eval_batch_size: 512
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-4]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.1, 0.2]}]
+        - ["unpopularity", {p: [0.2, 0.4]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xs.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xs.yaml
new file mode 100644
index 0000000..1d79e03
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xs.yaml
@@ -0,0 +1,77 @@
+# SASRec (XS) + BCE on Amazon-2018
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Amazon-2018 dataset.
+    search__interaction_data_path:
+    - ./data/amazon2018/proc/user2item.pkl
+    
+    max_seq_length: 100
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 16
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [128]
+        search__num_attention_heads: [2]
+        search__num_hidden_layers: [2]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 256
+        per_device_eval_batch_size: 512
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-3]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.1, 0.2]}]
+        - ["unpopularity", {p: [0.2, 0.4]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xxl.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xxl.yaml
new file mode 100644
index 0000000..5661ebc
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xxl.yaml
@@ -0,0 +1,77 @@
+# SASRec (XXL) + BCE on Amazon-2018
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Amazon-2018 dataset.
+    search__interaction_data_path:
+    - ./data/amazon2018/proc/user2item.pkl
+    
+    max_seq_length: 100
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 16
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [1536]
+        search__num_attention_heads: [24]
+        search__num_hidden_layers: [12]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 256
+        per_device_eval_batch_size: 512
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-4]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.1, 0.2]}]
+        - ["unpopularity", {p: [0.2, 0.4]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xxs.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xxs.yaml
new file mode 100644
index 0000000..691c9af
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_amazons/sasrec_bce_amazon_xxs.yaml
@@ -0,0 +1,77 @@
+# SASRec (XXS) + BCE on Amazon-2018
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Amazon-2018 dataset.
+    search__interaction_data_path:
+    - ./data/amazon2018/proc/user2item.pkl
+    
+    max_seq_length: 100
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 16
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [64]
+        search__num_attention_heads: [1]
+        search__num_hidden_layers: [1]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 256
+        per_device_eval_batch_size: 512
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-3]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.1, 0.2]}]
+        - ["unpopularity", {p: [0.2, 0.4]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_b.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_b.yaml
new file mode 100644
index 0000000..4cf577f
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_b.yaml
@@ -0,0 +1,76 @@
+# SASRec (Base) + BCE on Movielens-20M
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Movielens-20M dataset.
+    search__interaction_data_path:
+    - ./data/movielens-20m/proc/user2item.pkl
+    
+    max_seq_length: 200
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 32
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [512]
+        search__num_attention_heads: [8]
+        search__num_hidden_layers: [6]
+
+        # subclass model parameters
+        search__attention_dropout: [0.1, 0.2, 0.4]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 128
+        per_device_eval_batch_size: 256
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [5.0e-4, 1.0e-3]
+        search__weight_decay: [0.1, 0.2, 0.05]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.05, 0.1]}]
+        - ["unpopularity", {p: [0.6, 0.8]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_l.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_l.yaml
new file mode 100644
index 0000000..0ae974a
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_l.yaml
@@ -0,0 +1,76 @@
+# SASRec (Large) + BCE on Movielens-20M
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Movielens-20M dataset.
+    search__interaction_data_path:
+    - ./data/movielens-20m/proc/user2item.pkl
+    
+    max_seq_length: 200
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 32
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [768]
+        search__num_attention_heads: [12]
+        search__num_hidden_layers: [8]
+
+        # subclass model parameters
+        search__attention_dropout: [0.2]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 128
+        per_device_eval_batch_size: 256
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [5.0e-4]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.05, 0.1]}]
+        - ["unpopularity", {p: [0.6, 0.8]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_s.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_s.yaml
new file mode 100644
index 0000000..321198e
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_s.yaml
@@ -0,0 +1,76 @@
+# SASRec (Small) + BCE on Movielens-20M
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Movielens-20M dataset.
+    search__interaction_data_path:
+    - ./data/movielens-20m/proc/user2item.pkl
+    
+    max_seq_length: 200
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 32
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [256]
+        search__num_attention_heads: [4]
+        search__num_hidden_layers: [4]
+
+        # subclass model parameters
+        search__attention_dropout: [0, 0.2]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 128
+        per_device_eval_batch_size: 256
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-3]
+        search__weight_decay: [0.1, 0.05, 0.01]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.05, 0.1]}]
+        - ["unpopularity", {p: [0.6, 0.8]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xl.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xl.yaml
new file mode 100644
index 0000000..32906d3
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xl.yaml
@@ -0,0 +1,76 @@
+# SASRec (XL) + BCE on Movielens-20M
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Movielens-20M dataset.
+    search__interaction_data_path:
+    - ./data/movielens-20m/proc/user2item.pkl
+    
+    max_seq_length: 200
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 32
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [1024]
+        search__num_attention_heads: [16]
+        search__num_hidden_layers: [10]
+
+        # subclass model parameters
+        search__attention_dropout: [0.2]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 128
+        per_device_eval_batch_size: 256
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-4]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.05, 0.1]}]
+        - ["unpopularity", {p: [0.6, 0.8]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xs.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xs.yaml
new file mode 100644
index 0000000..aa5bf00
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xs.yaml
@@ -0,0 +1,76 @@
+# SASRec (XS) + BCE on Movielens-20M
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Movielens-20M dataset.
+    search__interaction_data_path:
+    - ./data/movielens-20m/proc/user2item.pkl
+    
+    max_seq_length: 200
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 32
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [128]
+        search__num_attention_heads: [2]
+        search__num_hidden_layers: [2]
+
+        # subclass model parameters
+        search__attention_dropout: [0, 0.2]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 128
+        per_device_eval_batch_size: 256
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-3]
+        search__weight_decay: [0.1, 0.05, 0.01]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.05, 0.1]}]
+        - ["unpopularity", {p: [0.6, 0.8]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xxl.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xxl.yaml
new file mode 100644
index 0000000..d188c85
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xxl.yaml
@@ -0,0 +1,77 @@
+# SASRec (XXL) + BCE on Movielens-20M
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Movielens-20M dataset.
+    search__interaction_data_path:
+    - ./data/movielens-20m/proc/user2item.pkl
+    
+    max_seq_length: 200
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 32
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [1536]
+        search__num_attention_heads: [24]
+        search__num_hidden_layers: [12]
+
+        # subclass model parameters
+        search__attention_dropout: [0.2]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 128
+        per_device_eval_batch_size: 256
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-4]
+        search__weight_decay: [0.1]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.05, 0.1]}]
+        - ["unpopularity", {p: [0.6, 0.8]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]
+
diff --git a/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xxs.yaml b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xxs.yaml
new file mode 100644
index 0000000..cc522b6
--- /dev/null
+++ b/scripts/configs/seqrec/scaling/sasrec_bce_movielens/sasrec_bce_movielens_xxs.yaml
@@ -0,0 +1,76 @@
+# SASRec (XXS) + BCE on Movielens-20M
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+test_eval: false  # whether to run evaluation on the test set instead of validation set
+save_predictions: false  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: seqrec
+
+    # specific path to interaction data file for Movielens-20M dataset.
+    search__interaction_data_path:
+    - ./data/movielens-20m/proc/user2item.pkl
+    
+    max_seq_length: 200
+
+# collator settings
+collator:
+    type: seqrec
+
+    num_negative_samples: 32
+    negative_sampling_strategy: uniform
+
+# model settings
+model:
+    type: sasrec
+
+    config:
+        # base model parameters
+        search__hidden_size: [64]
+        search__num_attention_heads: [1]
+        search__num_hidden_layers: [1]
+
+        # subclass model parameters
+        search__attention_dropout: [0, 0.2]
+
+# trainer settings
+trainer:
+    type: bce
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        search__num_train_epochs: [100]
+        per_device_train_batch_size: 128
+        per_device_eval_batch_size: 256
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        search__learning_rate: [1.0e-3]
+        search__weight_decay: [0.1, 0.05, 0.01]
+        lr_scheduler_type: cosine
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: ndcg@5  # should exist in the metrics
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 4
+        gradient_checkpointing: true
+        bf16: true
+        tf32: false
+
+        # base trainer parameters
+        norm_embeddings: false  # whether to L2-normalize user and item embeddings
+        eval_interval: 5  # run metrics every epoch
+        metrics:
+        - ["hr", {}]
+        - ["ndcg", {}]
+        - ["popularity", {p: [0.05, 0.1]}]
+        - ["unpopularity", {p: [0.6, 0.8]}]
+        model_loss_weight: 1.0
+        top_k: [1, 5, 10]