forked from teddyhla/team2hackathon
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexperiments.py
More file actions
127 lines (96 loc) · 3.41 KB
/
experiments.py
File metadata and controls
127 lines (96 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from chromadb.utils import embedding_functions
from embedding.chroma import ChromaDB
def add_docs_to_db(doctxt, id):
    """Add document(s) to the Chroma ``medical_clerking`` collection.

    Args:
        doctxt: Document text(s) to store; forwarded unchanged as
            ``documents=`` to ``Collection.add`` — presumably a list of
            strings (or a single string) parallel to ``id``; confirm
            against the ChromaDB wrapper.
        id: Unique identifier(s) for the document(s); forwarded as
            ``ids=``. NOTE(review): the name shadows the ``id`` builtin,
            but renaming it would break keyword-argument callers, so it
            is kept as-is.

    Returns:
        None. Prints a confirmation message on success.
    """
    db = ChromaDB()
    # Collection handle exposed by the project's ChromaDB wrapper.
    dev = db.medical_clerking
    dev.add(documents=doctxt, ids=id)
    print("documents added to database successfully")
    return
class Embedder:
    """Thin wrapper around chromadb's default embedding function."""

    def __init__(self):
        # DefaultEmbeddingFunction ships with chromadb; no model choice
        # is exposed here — callers always get the library default.
        self.default_ef = embedding_functions.DefaultEmbeddingFunction()

    def create_embeddings(self, userinput):
        """Create an embedding for a single piece of text.

        Args:
            userinput (str): The text to embed.

        Returns:
            Whatever the default embedding function returns for a
            one-element document list — typically a list containing one
            embedding vector (confirm against the installed chromadb
            version).
        """
        # Embedding functions expect a list of documents, so wrap the
        # single input. (Renamed from `input`, which shadowed a builtin.)
        texts = [userinput]
        embeddings = self.default_ef(texts)
        print(embeddings)  # kept: original behavior echoes the result
        return embeddings
# Example usage: embed two sample complaints with the default model.
embedder = Embedder()
eg = "Doctor I have a pneumonia"
eg2 = "oh my leg hurts"
# Unpacking forces both calls eagerly, left to right — same order and
# side effects as two separate assignments.
v2, v3 = (embedder.create_embeddings(userinput=text) for text in (eg, eg2))
# model architecture
# data loader (will load the input data)
# data processor (will clean the loaded data)
# embeddings generator (will then create an embedding)
# feature suggestion (let the user select other llm)
# nearest distance calculator
# feature suggestion (the user can select top 3 vs 5 vs 10)
# serve the nearest distances
# from pathlib import Path
# import pandas as pd
# import torch
# from transformers import AutoModel, AutoTokenizer
# # data loader (will load the input data)
# def dataloader(path):
# """
# load csv file
# """
# try:
# path = Path(path)
# if path.is_dir():
# csv_files = list(path.glob("*.csv"))
# if not csv_files:
# raise FileNotFoundError(f"No csv files found in directory: {path}")
# print(f"Found {len(csv_files)} CSV file(s) in this directory")
# pd.read_csv(csv_files[0])
# print(f"Successfully loaded: {csv_files[0].name}")
# elif path.suffix == ".csv":
# if not path.exists():
# raise FileNotFoundError()
# print(f"File found at {path}")
# pd.read_csv(path)
# except Exception as e:
# print(f"Error found {e}")
# # data processor (will clean the loaded data)
# data = {
# 'x': ["Doctor, I have a shortness of breath and chest pain and also have temperatures",
# "My leg hurts", "I am feeling dizzy"],
# 'y': ["pneumonia", "dvt", "vertigo"]
# }
# df = pd.DataFrame(data)
# tokenizer = AutoTokenizer.from_pretrained("Charangan/MedBERT")
# model = AutoModel.from_pretrained("Charangan/MedBERT")
# def text_to_embedding(text):
# # Tokenize the text
# inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
# # Get the embeddings
# with torch.no_grad():
# outputs = model(**inputs)
# # The embeddings are in the last hidden state
# embeddings = outputs.last_hidden_state
# # You can use the mean of the embeddings as the representation
# mean_embeddings = embeddings.mean(dim=1)
# return mean_embeddings
# # Apply the function to the 'x' column
# df['x_embeddings'] = df['x'].apply(lambda x: text_to_embedding(x).squeeze().numpy())