bdeanhardt · gabrielfnayres · Nov 11, 2024 · Nov 12, 2024 · Nov 12, 2024 · Nov 13, 2024
diff --git a/README.md b/README.md
@@ -64,8 +64,9 @@ Developers should set up `pre-commit` as well with `pre-commit install`.
 ### Running Test Cases
 
 ```
-> pytest   # will run all test cases - including ones that require a gpu
-> pytest  -m "not gpu"  # run test cases that can work with just cpu
+> pytest   # run test cases that can work with just cpu
+> pytest  -m ''  # will run all test cases - including ones that require a gpu
+> pytest -m gpu # run only gpu test cases
 ```
 
 
@@ -99,6 +100,30 @@ torchrun --standalone --nproc_per_node=1  ml_mdm/clis/generate_sample.py --port
 
 ## Codebase
 
+
+### 1. /configs
+
+| module | description |
+| - | - |
+| `configs.dataset_creation` | Configuration file for dataset splitting into train-eval-val pipeline |
+| `configs.datasets` | Datasets for training and evaluation phases of the model |
+| `configs.models` | Configuration files for different resolution models |
+
+
+### 2. /data
+
+| module | description |
+| - | - |
+| `data` | <ul><li><b>bert.vocab:</b> BERT-trained dictionary containing tokens and their associated vector representations</li><li><b>c4_wpm.vocab:</b> C4-trained dictionary containing tokens and their associated vector representations</li><li><b>cifar10.vocab:</b> CIFAR10-trained dictionary containing tokens and their associated vector representations</li><li><b>imagenet.vocab:</b> Prompts associated with Imagenet dataset</li><li><b>prompts_cc12m-64x64.tsv:</b> Prompts associated with cc12m dataset for the 64x64 res. model</li><li><b>prompts_cc12m-256x256.tsv:</b> Prompts associated with cc12m dataset for the 256x256 res. model</li><li><b>prompts_cifar10-32x32.tsv:</b> Prompts associated with cifar10 dataset for the 32x32 res. model </li><li><b>prompts_cifar10-64x64.tsv:</b> Prompts associated with cifar10 dataset for the 64x64 res. model </li><li><b>prompts_demo.tsv:</b> Extra demo prompts </li><li><b>prompts_imagenet-64px.tsv:</b> Prompts associated with imagenet dataset for the 64x64 res. model </li><li><b>prompts_WebImage-ALIGN-64px.tsv:</b> Prompts associated with WebImage-ALIGN dataset for the 64x64 res. model </li><li><b>t5.vocab:</b> t5-trained dictionary containing tokens and their associated vector representations </li><li><b>tokenizer_spm_32000_50m.vocab:</b> SPM-trained dictionary containing tokens and their associated vector representations </li></ul> |
+
+### 3. /docs
+
+| module | description |
+| - | - |
+| `docs` | <ul><li><b>web_demo.png:</b> Screenshot of the web demo of the model</li></ul> |
+
+### 4. /ml_mdm 
+
 | module | description |
 | - | - |
 | `ml_mdm.models` | The core model implementations |
@@ -107,7 +132,11 @@ torchrun --standalone --nproc_per_node=1  ml_mdm/clis/generate_sample.py --port
 | `ml_mdm.clis` | All command line tools in the project, the most relevant being `train_parallel.py` |
 | `tests/` | Unit tests and sample training files |
 
+### 5. /tests
 
+| module | description |
+| - | - |
+| `tests.test_files` | Sample files for testing |
 
 # Concepts
 
@@ -125,6 +154,22 @@ In the `ml_mdm.models` submodule, we've open sourced our implementations of:
 > In essence, `simple_parsing` will convert all passed cli arguments and yaml files into clean configuration classes like `ml_mdm.reader.ReaderConfig`, `ml_mdm.diffusion.DiffusionConfig`.
 
 
+`ml_mdm.config` stores a global mapping of names to classes in `MODEL_REGISTRY`, `MODEL_CONFIG_REGISTRY`, `PIPELINE_REGISTRY`, and `PIPELINE_CONFIG_REGISTRY`.
+
+`MODEL_REGISTRY` and `PIPELINE_REGISTRY` store information as shown in the following example:
+
+> *_CONFIG_REGISTRY[architecture name]["model"] = model name
+
+> *_CONFIG_REGISTRY[architecture name]["config"] = configuration class
+
+MODEL_CONFIG_REGISTRY and PIPELINE_CONFIG_REGISTRY store information as shown in the following example: 
+> *_CONFIG_REGISTRY[architecture name]["model"] = model name
+
+> *_CONFIG_REGISTRY[architecture name]["config"] = configuration class
+
+
+architecture name and model name are passed into ml_mdm.config through the function parameter *names. where *names points to "architecture name", "model name"
+
 
 
 # Tutorials
@@ -263,11 +308,11 @@ reader_config:
 Then you can use our dataset download helper:
 ```console
 python -m ml_mdm.clis.download_tar_from_index \
-  --dataset-config-file configs/datasets/cc12m.yaml \
+  --dataset_config_file configs/datasets/cc12m.yaml \
   --subset train --download_tar
 
 python -m ml_mdm.clis.download_tar_from_index \
-  --dataset-config-file configs/datasets/cc12m.yaml \
+  --dataset_config_file configs/datasets/cc12m.yaml \
   --subset eval --download_tar
 ```
 

diff --git a/configs/dataset_creation/sample_cc12m.yaml → ...onfigs/dataset_creation/sample_cc12m.yaml b/configs/dataset_creation/sample_cc12m.yaml → ...onfigs/dataset_creation/sample_cc12m.yaml
diff --git a/configs/datasets/cc12m.yaml → ...dm-matryoshka/configs/datasets/cc12m.yaml b/configs/datasets/cc12m.yaml → ...dm-matryoshka/configs/datasets/cc12m.yaml
diff --git a/configs/models/cc12m_1024x1024.yaml → ...oshka/configs/models/cc12m_1024x1024.yaml b/configs/models/cc12m_1024x1024.yaml → ...oshka/configs/models/cc12m_1024x1024.yaml
diff --git a/configs/models/cc12m_256x256.yaml → ...ryoshka/configs/models/cc12m_256x256.yaml b/configs/models/cc12m_256x256.yaml → ...ryoshka/configs/models/cc12m_256x256.yaml
diff --git a/configs/models/cc12m_64x64.yaml → ...atryoshka/configs/models/cc12m_64x64.yaml b/configs/models/cc12m_64x64.yaml → ...atryoshka/configs/models/cc12m_64x64.yaml
diff --git a/data/bert.vocab → ml-mdm-matryoshka/data/bert.vocab b/data/bert.vocab → ml-mdm-matryoshka/data/bert.vocab
diff --git a/data/c4_wpm.vocab → ml-mdm-matryoshka/data/c4_wpm.vocab b/data/c4_wpm.vocab → ml-mdm-matryoshka/data/c4_wpm.vocab
diff --git a/data/cifar10.vocab → ml-mdm-matryoshka/data/cifar10.vocab b/data/cifar10.vocab → ml-mdm-matryoshka/data/cifar10.vocab
diff --git a/data/imagenet.vocab → ml-mdm-matryoshka/data/imagenet.vocab b/data/imagenet.vocab → ml-mdm-matryoshka/data/imagenet.vocab
diff --git a/data/prompts_WebImage-ALIGN-64px.tsv → ...shka/data/prompts_WebImage-ALIGN-64px.tsv b/data/prompts_WebImage-ALIGN-64px.tsv → ...shka/data/prompts_WebImage-ALIGN-64px.tsv
diff --git a/data/prompts_cc12m-256x256.tsv → ...matryoshka/data/prompts_cc12m-256x256.tsv b/data/prompts_cc12m-256x256.tsv → ...matryoshka/data/prompts_cc12m-256x256.tsv
diff --git a/data/prompts_cc12m-64x64.tsv → ...m-matryoshka/data/prompts_cc12m-64x64.tsv b/data/prompts_cc12m-64x64.tsv → ...m-matryoshka/data/prompts_cc12m-64x64.tsv
diff --git a/data/prompts_cifar10-32x32.tsv → ...matryoshka/data/prompts_cifar10-32x32.tsv b/data/prompts_cifar10-32x32.tsv → ...matryoshka/data/prompts_cifar10-32x32.tsv
diff --git a/data/prompts_cifar10-64x64.tsv → ...matryoshka/data/prompts_cifar10-64x64.tsv b/data/prompts_cifar10-64x64.tsv → ...matryoshka/data/prompts_cifar10-64x64.tsv
diff --git a/data/prompts_demo.tsv → ml-mdm-matryoshka/data/prompts_demo.tsv b/data/prompts_demo.tsv → ml-mdm-matryoshka/data/prompts_demo.tsv
diff --git a/data/prompts_imagenet-64px.tsv → ...matryoshka/data/prompts_imagenet-64px.tsv b/data/prompts_imagenet-64px.tsv → ...matryoshka/data/prompts_imagenet-64px.tsv
diff --git a/data/t5.vocab → ml-mdm-matryoshka/data/t5.vocab b/data/t5.vocab → ml-mdm-matryoshka/data/t5.vocab
diff --git a/data/tokenizer_spm_32000_50m.vocab → ...yoshka/data/tokenizer_spm_32000_50m.vocab b/data/tokenizer_spm_32000_50m.vocab → ...yoshka/data/tokenizer_spm_32000_50m.vocab
diff --git a/ml_mdm/clis/__init__.py → ml-mdm-matryoshka/ml_mdm/clis/__init__.py b/ml_mdm/clis/__init__.py → ml-mdm-matryoshka/ml_mdm/clis/__init__.py
diff --git a/ml_mdm/clis/download_tar_from_index.py → ...ka/ml_mdm/clis/download_tar_from_index.py b/ml_mdm/clis/download_tar_from_index.py → ...ka/ml_mdm/clis/download_tar_from_index.py
@@ -17,7 +17,7 @@
 nodes this data will be distributed over.
 """
 
-import argparse
+import simple_parsing
 import csv
 import logging
 import os
@@ -33,7 +33,30 @@
 import mlx.data
 
 from ml_mdm import helpers, s3_helpers
-
+from dataclasses import dataclass, field
+
+@dataclass
+class DownloadConfig:
+    dataset_config_file: str = field(default="", 
+        metadata={"help": "yaml file with dataset names"})
+    worker_id: int = field(default=0, 
+        metadata={"help": "current worker in [0, num-downloaders -1]"})
+    num_downloaders: int = field(default=1, 
+        metadata={"help": "number of parallel downloaders"})
+    no_bandwidth: bool = field(default=False)
+    download_tar: bool = field(default=False, 
+        metadata={"help": "whether or not to download tar files also"})
+    pretrained_text_embeddings: str = field(default=None)
+    endpoint_url: str = field(default="", 
+        metadata={"help": "end point for the s3 bucket — uses environment variable AWS_ENDPOINT_URL otherwise"})
+    subset: str = field(default="train", 
+        metadata={"choices": ["train", "eval"], 
+                  "help": "subset to download [train|eval]"})
+
+def get_parser():
+    parser = simple_parsing.ArgumentParser(description="Download tar files referred to in index file from mlx")
+    parser.add_arguments(DownloadConfig, dest="options")
+    return parser
 
 def read_tsv(filename):
     # Open the TSV file for reading
@@ -331,44 +354,7 @@ def main(args):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Download tar files referred to in index file from mlx"
-    )
-    parser.add_argument(
-        "--dataset-config-file",
-        type=str,
-        default="",
-        help="yaml file with dataset names",
-    )
-    parser.add_argument(
-        "--worker-id",
-        type=int,
-        default=0,
-        help="current worker in [0, num-downloaders -1]",
-    )
-    parser.add_argument(
-        "--num-downloaders", type=int, default=1, help="number of parallel downloaders"
-    )
-    parser.add_argument("--no_bandwidth", action="store_true")
-    parser.add_argument(
-        "--download_tar",
-        action="store_true",
-        help="whether or not to download tar files also",
-    )
-    parser.add_argument("--pretrained-text-embeddings", type=str, default=None)
-    parser.add_argument(
-        "--endpoint-url",
-        type=str,
-        default="",
-        help="end point for the s3 bucket — uses environment variable AWS_ENDPOINT_URL otherwise",
-    )
-    parser.add_argument(
-        "--subset",
-        type=str,
-        default="train",
-        choices=["train", "eval"],
-        help="subset to download [train|eval]",
-    )
+    parser = get_parser()
     args = parser.parse_args()
     logging.basicConfig(
         level="INFO",
@@ -377,5 +363,5 @@ def main(args):
         ),
         datefmt="%H:%M:%S",
     )
-    helpers.print_args(args)
-    main(args)
+    helpers.print_args(args.options)
+    main(args.options)
diff --git a/ml_mdm/clis/generate_batch.py → ...-matryoshka/ml_mdm/clis/generate_batch.py b/ml_mdm/clis/generate_batch.py → ...-matryoshka/ml_mdm/clis/generate_batch.py
diff --git a/ml_mdm/clis/generate_sample.py → ...matryoshka/ml_mdm/clis/generate_sample.py b/ml_mdm/clis/generate_sample.py → ...matryoshka/ml_mdm/clis/generate_sample.py
@@ -1,11 +1,12 @@
 # For licensing see accompanying LICENSE file.
 # Copyright (C) 2024 Apple Inc. All rights reserved.
+import argparse
 import logging
 import os
 import shlex
 import time
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Tuple
 
 import gradio as gr
 import simple_parsing
@@ -16,6 +17,8 @@
 import torch
 from torchvision.utils import make_grid
 
+import ml_mdm.language_models.factory
+import ml_mdm.language_models.tokenizer
 from ml_mdm import helpers, reader
 from ml_mdm.config import get_arguments, get_model, get_pipeline
 from ml_mdm.language_models import factory
@@ -36,22 +39,28 @@
 )
 
 
-def dividable(n):
+def dividable(n: int) -> Tuple[int, int]:
     for i in range(int(np.sqrt(n)), 0, -1):
         if n % i == 0:
             break
     return i, n // i
 
 
-def generate_lm_outputs(device, sample, tokenizer, language_model, args):
+def generate_lm_outputs(
+    device: torch.device,
+    sample: dict,
+    tokenizer: ml_mdm.language_models.tokenizer.Tokenizer,
+    language_model: ml_mdm.language_models.factory.LanguageModel,
+    args: argparse.Namespace,
+) -> dict:
     with torch.no_grad():
         lm_outputs, lm_mask = language_model(sample, tokenizer)
         sample["lm_outputs"] = lm_outputs
         sample["lm_mask"] = lm_mask
     return sample
 
 
-def setup_models(args, device):
+def setup_models(args: argparse.Namespace, device: torch.device):
     input_channels = 3
 
     # load the language model
@@ -68,7 +77,10 @@ def setup_models(args, device):
     return tokenizer, language_model, diffusion_model
 
 
-def plot_logsnr(logsnrs, total_steps):
+
+def plot_logsnr(logsnrs: list, total_steps: int) -> np.ndarray:
+    import matplotlib
+    matplotlib.use('Agg')
     import matplotlib.pyplot as plt
 
     x = 1 - np.arange(len(logsnrs)) / (total_steps - 1)
@@ -103,39 +115,40 @@ class GLOBAL_DATA:
 global_config = GLOBAL_DATA()
 
 
-def stop_run():
+def stop_run() -> gr.component:
     return (
         gr.update(value="Run", variant="primary", visible=True),
         gr.update(visible=False),
     )
 
 
-def get_model_type(config_file):
+
+def get_model_type(config_file: str) -> str:
     with open(config_file, "r") as f:
         d = yaml.safe_load(f)
         return d.get("model", d.get("vision_model", "unet"))
 
 
 def generate(
-    config_file="cc12m_64x64.yaml",
-    ckpt_name="vis_model_64x64.pth",
-    prompt="a chair",
-    input_template="",
-    negative_prompt="",
-    negative_template="",
-    batch_size=20,
-    guidance_scale=7.5,
-    threshold_function="clip",
-    num_inference_steps=250,
-    eta=0,
-    save_diffusion_path=False,
-    show_diffusion_path=False,
-    show_xt=False,
-    reader_config="",
-    seed=10,
-    comment="",
-    override_args="",
-    output_inner=False,
+    config_file: str = "cc12m_64x64.yaml",
+    ckpt_name: str = "vis_model_64x64.pth",
+    prompt: str = "a chair",
+    input_template: str = "",
+    negative_prompt: str = "",
+    negative_template: str = "",
+    batch_size: int = 20,
+    guidance_scale: float = 7.5,
+    threshold_function: str = "clip",
+    num_inference_steps: int = 250,
+    eta: int = 0,
+    save_diffusion_path: bool = False,
+    show_diffusion_path: bool = False,
+    show_xt: bool = False,
+    reader_config: str = "",
+    seed: int = 10,
+    comment: str = "",
+    override_args: str = "",
+    output_inner: bool = False,
 ):
     np.random.seed(seed)
     torch.random.manual_seed(seed)
@@ -292,7 +305,7 @@ def generate(
                 )
 
 
-def main(args):
+def main(args: argparse.Namespace):
     # get the language model outputs
     example_texts = open("data/prompts_demo.tsv").readlines()