Tiny-Snow · Tiny-Snow · Jan 27, 2026 · Jan 25, 2026 · Jan 27, 2026
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,8 @@ __pycache__/
 # Data, scripts & artifacts
 data
 data/
+.hf-models
+.hf-models/
 outputs/
 logs/
 scripts_running/

diff --git a/scripts/configs/quantizer/rqvae.yaml b/scripts/configs/quantizer/rqvae.yaml
@@ -0,0 +1,75 @@
+# An example config file for RQ-VAE on the MovieLens-1M dataset with grid search
+# If you want to add hyperparameter search space, use the "search__" prefix before the parameter name.
+
+# global settings
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+save_predictions: true  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: quantizer
+
+    interaction_data_path: /path/to/movielens-1m/proc/user2item.pkl  # TODO: path to interaction data file
+    textual_data_path: /path/to/movielens-1m/proc/item2title.pkl  # TODO: path to textual data file
+
+    lm_encoder_type: sentence_t5
+    lm_encoder_path: /path/to/sentence-transformers/sentence-t5-base  # TODO: path to pretrained language model encoder
+
+# collator settings
+collator:
+    type: quantizer
+
+# model settings
+model:
+    type: rqvae
+
+    config:
+        # base model parameters
+        hidden_sizes: [512, 256, 128]
+        num_codebooks: 3
+        codebook_size: 256
+        codebook_dim: 32
+
+        # subclass model parameters
+        kmeans_init: true
+        kmeans_max_iter: 10
+
+# trainer settings
+trainer:
+    type: rqvae
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+
+        # training arguments - Optimization & schedule
+        num_train_epochs: 20000
+        per_device_train_batch_size: 1024
+        per_device_eval_batch_size: 1024
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        learning_rate: 1.0e-3
+        weight_decay: 0.1
+        lr_scheduler_type: linear
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        metric_for_best_model: eval_loss  # should exist in the metrics
+        greater_is_better: false  # use loss to evaluate the best model
+
+        # training arguments - Parallelism & precision
+        bf16: false
+        tf32: true
+
+        # base trainer parameters
+        eval_interval: 100  # run metrics every `eval_interval` epochs (here: every 100 epochs)
+        train_stop_epoch: -1  # by default, do not stop training early
+        metrics:
+        - ["codebook_usage", {}]
+        - ["code_collision", {}]
+        codebook_loss_weight: 1.0
+        commitment_loss_weight: 0.25
+        model_loss_weight: 0.0
+
+        # subclass trainer parameters
diff --git a/scripts/configs/quantizer/template.yaml b/scripts/configs/quantizer/template.yaml
@@ -0,0 +1,103 @@
+# A template config file for quantizer training
+# If you want to add hyperparameter search space, use the "search__" prefix before the parameter name.
+
+# global settings
+seed: 42
+output_dir: null  # TODO: output directory to save model checkpoints, logs, and results
+pretrained_ckpt: null  # optional path to a pretrained checkpoint to load
+save_predictions: true  # whether to save the predictions on the test set
+
+# dataset settings
+dataset:
+    type: quantizer
+
+    interaction_data_path: null  # TODO: path to interaction data file
+    textual_data_path: null  # TODO: path to textual data file
+
+    lm_encoder_type: sentence_t5
+    lm_encoder_path: null  # TODO: path to pretrained language model encoder
+
+    aux_item_embeddings_path: null  # TODO: path to auxiliary item embeddings file (expected to be a .npy file)
+
+# collator settings
+collator:
+    type: quantizer
+
+    # no default parameters for quantizer collator at the moment
+
+# model settings
+model:
+    type: rqvae
+
+    config:
+        # base model parameters
+        hidden_sizes: [512, 256, 128]
+        num_codebooks: 3
+        codebook_size: 256
+        codebook_dim: 32
+
+        # subclass model parameters
+        kmeans_init: true
+        kmeans_max_iter: 10
+
+# trainer settings
+trainer:
+    type: rqvae
+
+    config:
+        # training arguments - Run control
+        do_train: true
+        do_eval: true
+        do_predict: true
+        overwrite_output_dir: true
+        remove_unused_columns: false
+
+        # training arguments - Optimization & schedule
+        num_train_epochs: 20000
+        per_device_train_batch_size: 1024
+        per_device_eval_batch_size: 1024
+        gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
+        learning_rate: 1.0e-3
+        weight_decay: 0.1
+        max_grad_norm: 1.0
+        optim: adamw_torch
+        lr_scheduler_type: linear
+        warmup_ratio: 0.05
+
+        # training arguments - Evaluation & checkpointing
+        eval_strategy: epoch
+        save_strategy: epoch
+        eval_delay: 0  # epochs to wait before the first evaluation (0 = no delay; raise it to skip warmup)
+        eval_accumulation_steps: 1
+        save_total_limit: 1  # with load_best_model_at_end, the best checkpoint is kept (plus the latest one if it differs)
+        load_best_model_at_end: true  # load the best model when finished training
+        metric_for_best_model: eval_loss  # should exist in the metrics
+        greater_is_better: false  # use loss to evaluate the best model
+        prediction_loss_only: false
+        save_safetensors: true
+
+        # training arguments - Parallelism & precision
+        dataloader_num_workers: 0
+        dataloader_pin_memory: true
+        dataloader_drop_last: false
+        ddp_find_unused_parameters: true
+        ddp_broadcast_buffers: false
+        gradient_checkpointing: false
+        bf16: false
+        tf32: true
+
+        # training arguments - Logging / tracking
+        logging_strategy: epoch
+        report_to: ["tensorboard"]
+
+        # base trainer parameters
+        eval_interval: 100  # run metrics every `eval_interval` epochs (here: every 100 epochs)
+        train_stop_epoch: -1  # by default, do not stop training early
+        metrics:
+        - ["codebook_usage", {}]
+        - ["code_collision", {}]
+        codebook_loss_weight: 1.0
+        commitment_loss_weight: 0.25
+        model_loss_weight: 0.0
+
+        # subclass trainer parameters
diff --git a/scripts/configs/seqrec/template.yaml b/scripts/configs/seqrec/template.yaml
@@ -1,4 +1,4 @@
-# Example config file for sequence recommendation tasks
+# A template config file for sequence recommendation tasks
 # If you want to add hyperparameter search space, use the "search__" prefix before the parameter name.
 
 # global settings
@@ -50,7 +50,6 @@ trainer:
 
         # training arguments - Optimization & schedule
         num_train_epochs: 200
-        train_stop_epoch: -1  # by default, do not stop training early
         per_device_train_batch_size: 512
         per_device_eval_batch_size: 1024
         gradient_accumulation_steps: 1  # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
@@ -90,6 +89,7 @@ trainer:
         # base trainer parameters
         norm_embeddings: false  # whether to L2-normalize user and item embeddings
         eval_interval: 5  # run metrics every epoch
+        train_stop_epoch: -1  # by default, do not stop training early
         metrics:
         - ["hr", {}]
         - ["ndcg", {}]

diff --git a/src/genrec/datasets/__init__.py b/src/genrec/datasets/__init__.py
@@ -52,3 +52,13 @@
     "SeqRecDataset",
     "SeqRecExample",
 ]
+
+from .modules import LMEncoder, LMEncoderFactory, NegativeSampler, NegativeSamplerFactory, PrefixTree
+
+__all__ += [
+    "LMEncoder",
+    "LMEncoderFactory",
+    "NegativeSampler",
+    "NegativeSamplerFactory",
+    "PrefixTree",
+]
diff --git a/src/genrec/datasets/base.py b/src/genrec/datasets/base.py
@@ -171,6 +171,7 @@ def __init__(
         sid_cache: Optional[Int[np.ndarray, "I+1 C"]] = None,
         textual_data_path: Optional[Union[pd.DataFrame, str, Path]] = None,
         lm_encoder: Optional[LMEncoder] = None,
+        **kwargs: Any,
     ) -> None:
         """Initialises the dataset and materialises user-level metadata.
 
@@ -188,6 +189,7 @@ def __init__(
                 pickle file with `ItemID` and `Title` columns.
             lm_encoder (Optional[LMEncoder]): Optional encoder used to transform item titles into
                 dense embeddings.
+            **kwargs (Any): Additional keyword arguments for the dataset.
         """
         if split not in {
             "train",
@@ -489,6 +491,7 @@ def __init__(
         no_pad_keys: Dict[str, type],
         pad_values: Dict[str, np.generic],
         seed: int = 42,
+        **kwargs: Any,
     ) -> None:
         """Configures the collator.
 
@@ -498,6 +501,7 @@ def __init__(
             pad_values (Dict[str, np.generic]): Padding values per field, e.g., {"field1": 0,
                 "field2": -100}. If a field is missing, defaults to 0.
             seed (int): Random seed for the collator's internal RNG.
+            **kwargs (Any): Additional keyword arguments for the collator.
         """
         SeedWorkerMixin.__init__(self, global_seed=seed)
 

diff --git a/src/genrec/datasets/dataset_genrec.py b/src/genrec/datasets/dataset_genrec.py
@@ -77,6 +77,23 @@ def __init__(
         textual_data_path: Optional[Union[pd.DataFrame, str, Path]] = None,
         lm_encoder: Optional[LMEncoder] = None,
     ) -> None:
+        """Initialises the dataset and materialises user-level metadata.
+
+        Args:
+            interaction_data_path (Union[pd.DataFrame, str, Path]): Pandas DataFrame or path to a
+                pickle file containing `UserID` and `ItemID` columns. We assume that the `UserID`
+                begins from 0 and that `ItemID` begins from 1, both being contiguous integers. The
+                `ItemID` of 0 is reserved for padding.
+            split (DatasetSplitLiteral): Dataset split controlling example generation strategy.
+            max_seq_length (int): Maximum length of interaction histories.
+            min_seq_length (int): Minimum length of interaction histories.
+            sid_cache (Optional[Int[np.ndarray, "I+1 C"]]): Optional mapping from item ID to SID
+                sequence, stored as numpy arrays.
+            textual_data_path (Optional[Union[pd.DataFrame, str, Path]]): Optional DataFrame or
+                pickle file with `ItemID` and `Title` columns.
+            lm_encoder (Optional[LMEncoder]): Optional encoder used to transform item titles into
+                dense embeddings.
+        """
         super().__init__(
             interaction_data_path,
             split,