forked from teddyhla/team2hackathon
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexperiments.py
More file actions
127 lines (96 loc) · 3.41 KB
/
experiments.py
File metadata and controls
127 lines (96 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from chromadb.utils import embedding_functions
from embedding.chroma import ChromaDB
def add_docs_to_db(doctxt, id):
    """Add document(s) to the Chroma ``medical_clerking`` collection.

    Args:
        doctxt: Document text(s) to store; forwarded unchanged as
            ``documents=`` to ``Collection.add`` — presumably a list of
            strings (or a single string) parallel to ``id``; confirm
            against the ChromaDB wrapper.
        id: Unique identifier(s) for the document(s); forwarded as
            ``ids=``. NOTE(review): the name shadows the ``id`` builtin,
            but renaming it would break keyword-argument callers, so it
            is kept as-is.

    Returns:
        None. Prints a confirmation message on success.
    """
    db = ChromaDB()
    # Collection handle exposed by the project's ChromaDB wrapper.
    dev = db.medical_clerking
    dev.add(documents=doctxt, ids=id)
    print("documents added to database successfully")
    return
class Embedder:
    """Thin wrapper around chromadb's default embedding function."""

    def __init__(self):
        # DefaultEmbeddingFunction ships with chromadb; no model choice
        # is exposed here — callers always get the library default.
        self.default_ef = embedding_functions.DefaultEmbeddingFunction()

    def create_embeddings(self, userinput):
        """Create an embedding for a single piece of text.

        Args:
            userinput (str): The text to embed.

        Returns:
            Whatever the default embedding function returns for a
            one-element document list — typically a list containing one
            embedding vector (confirm against the installed chromadb
            version).
        """
        # Embedding functions expect a list of documents, so wrap the
        # single input. (Renamed from `input`, which shadowed a builtin.)
        texts = [userinput]
        embeddings = self.default_ef(texts)
        print(embeddings)  # kept: original behavior echoes the result
        return embeddings
# Example usage: embed two sample complaints with the default model.
embedder = Embedder()
eg = "Doctor I have a pneumonia"
eg2 = "oh my leg hurts"
# Unpacking forces both calls eagerly, left to right — same order and
# side effects as two separate assignments.
v2, v3 = (embedder.create_embeddings(userinput=text) for text in (eg, eg2))
# model architecture
# data loader (will load the input data)
# data processor (will clean the loaded data)
# embeddings generator (will then create an embedding)
# feature suggestion (let the user select other llm)
# nearest distance calculator
# feature suggestion (the user can select top 3 vs 5 vs 10)
# serve the nearest distances
# from pathlib import Path
# import pandas as pd
# import torch
# from transformers import AutoModel, AutoTokenizer
# # data loader (will load the input data)
# def dataloader(path):
# """
# load csv file
# """
# try:
# path = Path(path)
# if path.is_dir():
# csv_files = list(path.glob("*.csv"))
# if not csv_files:
# raise FileNotFoundError(f"No csv files found in directory: {path}")
# print(f"Found {len(csv_files)} CSV file(s) in this directory")
# pd.read_csv(csv_files[0])
# print(f"Successfully loaded: {csv_files[0].name}")
# elif path.suffix == ".csv":
# if not path.exists():
# raise FileNotFoundError()
# print(f"File found at {path}")
# pd.read_csv(path)
# except Exception as e:
# print(f"Error found {e}")
# # data processor (will clean the loaded data)
# data = {
# 'x': ["Doctor, I have a shortness of breath and chest pain and also have temperatures",
# "My leg hurts", "I am feeling dizzy"],
# 'y': ["pneumonia", "dvt", "vertigo"]
# }
# df = pd.DataFrame(data)
# tokenizer = AutoTokenizer.from_pretrained("Charangan/MedBERT")
# model = AutoModel.from_pretrained("Charangan/MedBERT")
# def text_to_embedding(text):
# # Tokenize the text
# inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
# # Get the embeddings
# with torch.no_grad():
# outputs = model(**inputs)
# # The embeddings are in the last hidden state
# embeddings = outputs.last_hidden_state
# # You can use the mean of the embeddings as the representation
# mean_embeddings = embeddings.mean(dim=1)
# return mean_embeddings
# # Apply the function to the 'x' column
# df['x_embeddings'] = df['x'].apply(lambda x: text_to_embedding(x).squeeze().numpy())