# modular-lm/clustering.py at main · Strong-AI-Lab/modular-lm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

import numpy as np
import argparse
import yaml
import tqdm
import os
import datetime
import matplotlib.pyplot as plt

from src.modular_lm.router.loader import load_router
from src.modular_lm.data.dataset import ProxyDataset

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
from peft import PeftModel

from sklearn.cluster import BisectingKMeans, KMeans
from sklearn.manifold import MDS
import joblib


# Clustering algorithms compared by `main` when no router config is given.
# The keys are display names: they appear in the subplot titles and in the
# file names of the saved/loaded `.sav` models. The insertion order fixes the
# order in which the algorithms are laid out in the result figure.
CLUSTERING_ALGORITHMS = {
    "Bisecting K-Means": BisectingKMeans,
    "K-Means": KMeans,
}


def main():
    """Embed a dataset with a causal LM, then cluster and plot the embeddings.

    Pipeline:
      1. Parse the CLI arguments and load the YAML config files.
      2. Load the dataset and keep the "train" part of a fixed 80/20 split.
      3. Run the model on every sample and collect the hidden states of the
         layer selected by ``--hidden_level``.
      4. Reduce the hidden states to sentence embeddings (sum over tokens) or
         to one randomly chosen token position per sample, depending on the
         router name.
      5. Optionally plot the 2-D MDS projection coloured by ground-truth
         dataset ids.
      6. Either label the samples with a saved router (``--router_config``)
         and compute the cluster/ground-truth alignment table, or fit (or
         load from ``--saved_clusters_path``) every algorithm of
         ``CLUSTERING_ALGORITHMS`` for several cluster counts.
      7. Save the figure, the alignment table and the newly fitted models
         under ``cluster_results/``.

    Config keys read here:
      * model config: ``model_path``, ``tokenizer_config``, ``model_config``,
        ``max_length``, ``model_name`` and optionally ``peft``.
      * dataset config: ``dataset_path``, ``dataset_config``,
        ``dataset_name``, one of the flags ``huggingface`` / ``evals`` and
        optionally ``column_mappings`` (``text_id``, ``dataset_id``,
        ``dataset_list``).
      * router config: ``router_name``, ``router_path``, ``routing_strategy``
        (including ``num_embeddings``).

    Raises:
        ValueError: if the dataset config selects no known dataset source, or
            if the router name has an unknown suffix.
        RuntimeError: if no valid embedding could be computed.
    """
    parser = argparse.ArgumentParser(
        description="Cluster the embeddings of the samples from the specified dataset generated by the specified model."
    )
    parser.add_argument("model_config")
    parser.add_argument("dataset_config")
    parser.add_argument('--batch_size', type=int, default=8, help='Batch size for the model.')
    parser.add_argument('--router_config', type=str, default=None, help='Path to the router config file. If specified, the clusters will not be recomputed.')
    parser.add_argument('--saved_clusters_path', type=str, default=None, help='Path to the saved clusters. If specified, the clusters will not be recomputed.')
    parser.add_argument('--hidden_level', type=int, default=-1, help='Level of the hidden state to use for the embeddings. If -1, the last hidden state will be used.')
    parser.add_argument('--gpu', type=str, default=None, help='GPU to use. If None, CPU will be used.')
    args = parser.parse_args()

    # Load model config file
    with open(args.model_config, "r") as model_config_file:
        model_config = yaml.safe_load(model_config_file)

    # Load dataset config file
    with open(args.dataset_config, "r") as data_config_file:
        data_config = yaml.safe_load(data_config_file)

    # Load router config file, if any. Saved clusters then come from a routing
    # strategy: /!\ `args.saved_clusters_path` should usually be provided.
    router_config = None
    if args.router_config is not None:
        with open(args.router_config, "r") as router_config_file:
            router_config = yaml.safe_load(router_config_file)

    # Load evaluation dataset
    if data_config.get("huggingface"):
        dataset = load_dataset(data_config["dataset_path"], **data_config["dataset_config"])
    elif data_config.get("evals"):
        proxy = ProxyDataset(data_config["dataset_path"], **data_config["dataset_config"])
        dataset = Dataset.from_generator(proxy.generator)
    else:
        # Previously this fell through and failed later with an unbound `dataset`.
        raise ValueError("The dataset config must set either `huggingface` or `evals` to a true value.")
    dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
    dataset = dataset["train"]

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_config["model_path"], **model_config["tokenizer_config"])
    tokenizer.pad_token = tokenizer.bos_token # for some reason, bos_token is needed to avoid getting NaNs, see: https://discuss.huggingface.co/t/llama2-pad-token-for-batched-inference/48020

    model = AutoModelForCausalLM.from_pretrained(model_config["model_path"], **model_config["model_config"])
    if "peft" in model_config:
        model = PeftModel.from_pretrained(model, model_config["peft"])
        model = model.merge_and_unload()
    model.eval()

    device = None
    if args.gpu is not None:
        device = torch.device(args.gpu)
        model = model.to(device)

    # Resolve the column names. Each mapping falls back to its default on its
    # own, so a partial `column_mappings` section no longer leaves a name unbound.
    column_mappings = data_config.get("column_mappings") or {}
    text_id = column_mappings.get("text_id", "text")
    dataset_id = column_mappings.get("dataset_id", "dataset")
    dataset_list = column_mappings.get("dataset_list")

    # Compute the embeddings for each sample in the dataset using the loaded model
    embeddings = []
    batch = []
    gt_batch = []
    gt_datasets = [] if dataset_id in dataset[0] else None  # None: no ground truth available
    n_samples = len(dataset)

    with torch.no_grad():  # inference only: do not build autograd graphs
        for i in tqdm.trange(n_samples):
            sample = dataset[i]
            batch.append(sample[text_id])
            if gt_datasets is not None:
                if dataset_list is not None:
                    gt_batch.append(dataset_list.index(sample[dataset_id]))
                else:
                    gt_batch.append(int(sample[dataset_id]))

            # Wait for a full batch, except on the last sample where the
            # remaining partial batch is flushed instead of being dropped.
            if len(batch) < args.batch_size and i < n_samples - 1:
                continue

            encoded = tokenizer(batch, return_tensors='pt', padding="max_length", truncation=True, max_length=model_config["max_length"])
            if device is not None:
                encoded = {key: value.to(device) for key, value in encoded.items()}

            # Take ownership of the pending labels and reset both buffers now,
            # so a skipped batch cannot leak its labels into the next one.
            pending_gt, batch, gt_batch = gt_batch, [], []

            outputs = model(**encoded, output_hidden_states=True)
            outputs = outputs.hidden_states[args.hidden_level].detach().cpu()
            outputs = outputs.to(dtype=torch.float32).numpy()

            if np.isnan(outputs).any():
                print("NaN encountered in the embeddings. Skipping this batch. This can happen when the batch size is too large.")
                continue

            if np.isinf(outputs).any():
                print("Inf encountered in the embeddings. Truncating.")
                np.clip(outputs, -1e4, 1e4, out=outputs)
            elif np.abs(outputs).max() > 1e4:
                print("Large values encountered in the embeddings. Truncating.")
                np.clip(outputs, -1e4, 1e4, out=outputs)

            embeddings.extend(outputs)
            if gt_datasets is not None:
                gt_datasets.extend(pending_gt)

    if not embeddings:
        raise RuntimeError("No valid embeddings were computed: the dataset is empty or every batch contained NaNs.")
    embeddings = np.array(embeddings)

    if router_config is None or router_config['router_name'].startswith("input-"):
        mds_embeddings = embeddings.sum(axis=1) # sum the embeddings of each token in the input to get sentence embeddings [B x D]
        granularity = "sentence"
    else:
        embeddings = np.moveaxis(embeddings, 1, 0) # [B x L x D] -> [L x B x D]
        embeddings = np.random.permutation(embeddings)[:1,...] # shuffle and remove B(L-1) tokens to get [1 x B x D] token embeddings
        embeddings = np.moveaxis(embeddings, 1, 0) # [1 x B x D] -> [B x 1 x D]

        mds_embeddings = embeddings.reshape(-1, embeddings.shape[-1]) # flatten the embeddings to get a token embeddings [B x D]
        granularity = "token"

    # Output locations, computed once so that every artefact of a run shares
    # the same timestamp prefix.
    save_folder = f"cluster_results/{model_config['model_name']}_{data_config['dataset_name']}_{granularity}"
    save_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    run_prefix = f"{save_folder}/{save_time}_hl={args.hidden_level}"
    os.makedirs(save_folder, exist_ok=True)

    # If ground truth datasets are provided, plot the embeddings with different colors for each dataset
    if gt_datasets is not None:
        project = MDS(n_components=2)
        projection = project.fit_transform(mds_embeddings)

        gt_fig, gt_axs = plt.subplots(1, 1, figsize=(9, 8))
        gt_axs.scatter(projection[:, 0], projection[:, 1], s=10, c=gt_datasets)
        gt_axs.set_title("Ground truth datasets")
        gt_fig.savefig(f"{run_prefix}_gt_datasets.png")
        plt.close(gt_fig)

    algos = []  # (model, algorithm name, n_clusters, mds dims) for every model fitted in this run
    alignment_table = None

    if router_config is not None: # saved clusters are from a routing strategy: /!\ `args.saved_clusters_path` should usually be provided
        router_name = router_config['router_name']
        algo = load_router(router_config['router_path'], router_config['routing_strategy'], args.saved_clusters_path)

        if router_name.endswith("-cluster"): # save from sklearn clustering algorithm
            algo.clustering_algorithm.cluster_centers_ = algo.clustering_algorithm.cluster_centers_.astype(np.float64)
            labels, _ = algo(torch.as_tensor(embeddings, dtype=torch.float64))
        elif router_name.endswith("-quantizer"): # save from torch routing model
            labels, _ = algo(torch.as_tensor(embeddings, dtype=torch.float32))
        else:
            raise ValueError(f"Unknown router name: {router_config['router_name']}")

        if router_name.startswith("token-"):
            # Flatten every leading axis and keep the last one, which `argmax`
            # reduces below. The quantizer branch used `shape[1]` here, unlike
            # the cluster branch; `shape[-1]` is right for any rank >= 2.
            labels = labels.reshape(-1, labels.shape[-1])

        labels = labels.argmax(-1).detach().cpu().numpy()

        project = MDS(n_components=2)
        projection = project.fit_transform(mds_embeddings)

        # create single plot figure
        fig, axs = plt.subplots(1, 1, figsize=(9, 8))
        axs.scatter(projection[:, 0], projection[:, 1], s=10, c=labels)
        axs.set_title(f"{router_config['router_name']} : {router_config['routing_strategy']['num_embeddings']} clusters")

        # Compute alignment between the clusters and the ground truth datasets
        if gt_datasets is not None:
            gt_indices = np.asarray(gt_datasets, dtype=int)
            label_indices = np.asarray(labels, dtype=int)

            # Size the table by the largest id rather than by the number of
            # distinct ids, so an unused cluster/dataset id cannot cause an
            # out-of-bounds index.
            alignment_table = np.zeros((gt_indices.max() + 1, label_indices.max() + 1))
            np.add.at(alignment_table, (gt_indices, label_indices), 1)

            # Row-normalise; rows without any sample stay at zero instead of NaN.
            row_totals = alignment_table.sum(axis=1, keepdims=True)
            alignment_table = np.divide(
                alignment_table,
                row_totals,
                out=np.zeros_like(alignment_table),
                where=row_totals > 0,
            )

    else: # no saves or custom save from previous clustering.py run
        n_mds = 2
        cluster_lists = [4, 8, 16]
        n_algorithms = len(CLUSTERING_ALGORITHMS)

        project = MDS(n_components=n_mds)
        projection = project.fit_transform(mds_embeddings)

        fig, axs = plt.subplots(2 * n_algorithms, len(cluster_lists), figsize=(24, 20))
        axs = axs.T
        # First pass clusters the 2-D projection, second pass the raw embeddings.
        for k, data in enumerate([projection, mds_embeddings]):
            on_projection = k == 0
            mds_dims = n_mds if on_projection else 0
            mds_suffix = '(MDS={})'.format(n_mds) if on_projection else ''
            for i, (algorithm_name, Algorithm) in enumerate(CLUSTERING_ALGORITHMS.items()):
                for j, n_clusters in enumerate(cluster_lists):
                    if args.saved_clusters_path is None:
                        algo = Algorithm(n_clusters=n_clusters, n_init=3)
                        algo.fit(data)
                        algos.append((algo, algorithm_name, n_clusters, mds_dims))
                        labels = algo.labels_
                    else:
                        # NOTE: joblib files are pickles; only load models from a trusted location.
                        algo = joblib.load(os.path.join(args.saved_clusters_path, f"{algorithm_name}_{n_clusters}_mds={mds_dims}.sav"))
                        labels = algo.predict(data)

                    centers = algo.cluster_centers_
                    if not on_projection:
                        # /!\ this is an approximation of the position of the centers in the projected data space. there are no `transform` method for MDS. for more information, see: https://github.com/scikit-learn/scikit-learn/pull/16088 https://github.com/scikit-learn/scikit-learn/issues/15808
                        # The centers are concatenated first, so their projections are the FIRST rows of the result.
                        centers = project.fit_transform(np.concatenate((centers, mds_embeddings)))[:len(centers)]

                    ax = axs[j, i + k * n_algorithms]
                    ax.scatter(projection[:, 0], projection[:, 1], s=10, c=labels)
                    ax.scatter(centers[:, 0], centers[:, 1], c="r", s=20)
                    ax.set_title(f"{algorithm_name} : {n_clusters} clusters {mds_suffix}")

    # Save clustering results
    fig.savefig(f"{run_prefix}.png")

    if alignment_table is not None:
        print(f"Alignment table:\n{alignment_table}")
        np.save(f"{run_prefix}_gt_datasets_alignment_table.npy", alignment_table)

    # Save clustering models. `algos` is only filled when models were fitted
    # in this run, i.e. no router config and no `--saved_clusters_path`.
    if algos:
        os.makedirs(run_prefix, exist_ok=True)
        for fitted_algo, name, n_clusters, mds_dims in algos:
            joblib.dump(fitted_algo, f"{run_prefix}/{name}_{n_clusters}_mds={mds_dims}.sav")

# Run the clustering pipeline only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()