Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ __pycache__/
# Data, scripts & artifacts
data
data/
.hf-models
.hf-models/
outputs/
logs/
scripts_running/
Expand Down
75 changes: 75 additions & 0 deletions scripts/configs/quantizer/rqvae.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# An example config file for RQ-VAE on the MovieLens-1M dataset with grid search
# To add a hyperparameter search space, prefix the parameter name with "search__".

# global settings
pretrained_ckpt: null # optional path to a pretrained checkpoint to load
save_predictions: true # whether to save the predictions on the test set

# dataset settings
dataset:
type: quantizer

interaction_data_path: /path/to/movielens-1m/proc/user2item.pkl # TODO: path to interaction data file
textual_data_path: /path/to/movielens-1m/proc/item2title.pkl # TODO: path to textual data file

lm_encoder_type: sentence_t5
lm_encoder_path: /path/to/sentence-transformers/sentence-t5-base # TODO: path to pretrained language model encoder

# collator settings
collator:
type: quantizer

# model settings
model:
type: rqvae

config:
# base model parameters
hidden_sizes: [512, 256, 128]
num_codebooks: 3
codebook_size: 256
codebook_dim: 32

# subclass model parameters
kmeans_init: true
kmeans_max_iter: 10

# trainer settings
trainer:
type: rqvae

config:
# training arguments - Run control
do_train: true
do_eval: true
do_predict: true

# training arguments - Optimization & schedule
num_train_epochs: 20000
per_device_train_batch_size: 1024
per_device_eval_batch_size: 1024
gradient_accumulation_steps: 1 # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
learning_rate: 1.0e-3
weight_decay: 0.1
lr_scheduler_type: linear
warmup_ratio: 0.05

# training arguments - Evaluation & checkpointing
metric_for_best_model: eval_loss # should exist in the metrics
greater_is_better: false # use loss to evaluate the best model

# training arguments - Parallelism & precision
bf16: false
tf32: true

# base trainer parameters
eval_interval: 100 # run metrics every `eval_interval` epochs
train_stop_epoch: -1 # by default, do not stop training early
metrics:
- ["codebook_usage", {}]
- ["code_collision", {}]
codebook_loss_weight: 1.0
commitment_loss_weight: 0.25
model_loss_weight: 0.0

# subclass trainer parameters
103 changes: 103 additions & 0 deletions scripts/configs/quantizer/template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# A template config file for quantizer training
# To add a hyperparameter search space, prefix the parameter name with "search__".

# global settings
seed: 42
output_dir: null # TODO: output directory to save model checkpoints, logs, and results
pretrained_ckpt: null # optional path to a pretrained checkpoint to load
save_predictions: true # whether to save the predictions on the test set

# dataset settings
dataset:
type: quantizer

interaction_data_path: null # TODO: path to interaction data file
textual_data_path: null # TODO: path to textual data file

lm_encoder_type: sentence_t5
lm_encoder_path: null # TODO: path to pretrained language model encoder

aux_item_embeddings_path: null # TODO: path to auxiliary item embeddings file (expected to be a .npy file)

# collator settings
collator:
type: quantizer

# no default parameters for quantizer collator at the moment

# model settings
model:
type: rqvae

config:
# base model parameters
hidden_sizes: [512, 256, 128]
num_codebooks: 3
codebook_size: 256
codebook_dim: 32

# subclass model parameters
kmeans_init: true
kmeans_max_iter: 10

# trainer settings
trainer:
type: rqvae

config:
# training arguments - Run control
do_train: true
do_eval: true
do_predict: true
overwrite_output_dir: true
remove_unused_columns: false

# training arguments - Optimization & schedule
num_train_epochs: 20000
per_device_train_batch_size: 1024
per_device_eval_batch_size: 1024
gradient_accumulation_steps: 1 # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
learning_rate: 1.0e-3
weight_decay: 0.1
max_grad_norm: 1.0
optim: adamw_torch
lr_scheduler_type: linear
warmup_ratio: 0.05

# training arguments - Evaluation & checkpointing
eval_strategy: epoch
save_strategy: epoch
eval_delay: 0 # epochs to wait before the first evaluation (0 = evaluate from the start)
eval_accumulation_steps: 1
save_total_limit: 1 # limit retained checkpoints; with load_best_model_at_end the best checkpoint is always kept, so up to two may remain
load_best_model_at_end: true # load the best model when finished training
metric_for_best_model: eval_loss # should exist in the metrics
greater_is_better: false # use loss to evaluate the best model
prediction_loss_only: false
save_safetensors: true

# training arguments - Parallelism & precision
dataloader_num_workers: 0
dataloader_pin_memory: true
dataloader_drop_last: false
ddp_find_unused_parameters: true
ddp_broadcast_buffers: false
gradient_checkpointing: false
bf16: false
tf32: true

# training arguments - Logging / tracking
logging_strategy: epoch
report_to: ["tensorboard"]

# base trainer parameters
eval_interval: 100 # run metrics every `eval_interval` epochs
train_stop_epoch: -1 # by default, do not stop training early
metrics:
- ["codebook_usage", {}]
- ["code_collision", {}]
codebook_loss_weight: 1.0
commitment_loss_weight: 0.25
model_loss_weight: 0.0

# subclass trainer parameters
4 changes: 2 additions & 2 deletions scripts/configs/seqrec/template.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Example config file for sequence recommendation tasks
# A template config file for sequence recommendation tasks
# To add a hyperparameter search space, prefix the parameter name with "search__".

# global settings
Expand Down Expand Up @@ -50,7 +50,6 @@ trainer:

# training arguments - Optimization & schedule
num_train_epochs: 200
train_stop_epoch: -1 # by default, do not stop training early
per_device_train_batch_size: 512
per_device_eval_batch_size: 1024
gradient_accumulation_steps: 1 # batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
Expand Down Expand Up @@ -90,6 +89,7 @@ trainer:
# base trainer parameters
norm_embeddings: false # whether to L2-normalize user and item embeddings
eval_interval: 5 # run metrics every `eval_interval` epochs
train_stop_epoch: -1 # by default, do not stop training early
metrics:
- ["hr", {}]
- ["ndcg", {}]
Expand Down
10 changes: 10 additions & 0 deletions src/genrec/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,13 @@
"SeqRecDataset",
"SeqRecExample",
]

from .modules import LMEncoder, LMEncoderFactory, NegativeSampler, NegativeSamplerFactory, PrefixTree

__all__ += [
"LMEncoder",
"LMEncoderFactory",
"NegativeSampler",
"NegativeSamplerFactory",
"PrefixTree",
]
4 changes: 4 additions & 0 deletions src/genrec/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def __init__(
sid_cache: Optional[Int[np.ndarray, "I+1 C"]] = None,
textual_data_path: Optional[Union[pd.DataFrame, str, Path]] = None,
lm_encoder: Optional[LMEncoder] = None,
**kwargs: Any,
) -> None:
"""Initialises the dataset and materialises user-level metadata.

Expand All @@ -188,6 +189,7 @@ def __init__(
pickle file with `ItemID` and `Title` columns.
lm_encoder (Optional[LMEncoder]): Optional encoder used to transform item titles into
dense embeddings.
**kwargs (Any): Additional keyword arguments for the dataset.
"""
if split not in {
"train",
Expand Down Expand Up @@ -489,6 +491,7 @@ def __init__(
no_pad_keys: Dict[str, type],
pad_values: Dict[str, np.generic],
seed: int = 42,
**kwargs: Any,
) -> None:
"""Configures the collator.

Expand All @@ -498,6 +501,7 @@ def __init__(
pad_values (Dict[str, np.generic]): Padding values per field, e.g., {"field1": 0,
"field2": -100}. If a field is missing, defaults to 0.
seed (int): Random seed for the collator's internal RNG.
**kwargs (Any): Additional keyword arguments for the collator.
"""
SeedWorkerMixin.__init__(self, global_seed=seed)

Expand Down
17 changes: 17 additions & 0 deletions src/genrec/datasets/dataset_genrec.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,23 @@ def __init__(
textual_data_path: Optional[Union[pd.DataFrame, str, Path]] = None,
lm_encoder: Optional[LMEncoder] = None,
) -> None:
"""Initialises the dataset and materialises user-level metadata.

Args:
interaction_data_path (Union[pd.DataFrame, str, Path]): Pandas DataFrame or path to a
pickle file containing `UserID` and `ItemID` columns. We assume that the `UserID`
begins from 0 and that `ItemID` begins from 1, both being contiguous integers. The
`ItemID` of 0 is reserved for padding.
split (DatasetSplitLiteral): Dataset split controlling example generation strategy.
max_seq_length (int): Maximum length of interaction histories.
min_seq_length (int): Minimum length of interaction histories.
sid_cache (Optional[Int[np.ndarray, "I+1 C"]]): Optional mapping from item ID to SID
sequence, stored as numpy arrays.
textual_data_path (Optional[Union[pd.DataFrame, str, Path]]): Optional DataFrame or
pickle file with `ItemID` and `Title` columns.
lm_encoder (Optional[LMEncoder]): Optional encoder used to transform item titles into
dense embeddings.
"""
super().__init__(
interaction_data_path,
split,
Expand Down
Loading