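"""filter_method.py: TF-IDF-style filtering of a verbalizer's label words.

`tfidf_filter` scores every label word by how strongly its
contextualized-calibration logits correlate with its own class versus the
other classes, searches for a single cut rate shared across classes, prunes
each class's word list accordingly, and returns a text record of what was
kept and removed.
"""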
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_filter(myverbalizer, cc_logits, class_labels):
    """Prune the verbalizer's label words by a TF-IDF-style score and
    return a human-readable record of the kept and removed words."""
    myrecord = ""
    class_num = len(class_labels)
    # Norm order used in the IDF denominator: very large for binary tasks,
    # approaching 1 as the number of classes grows.
    norm_ord = 10 / (class_num - 2 + 1e-2) + 1
    print("norm_ord", norm_ord)
    context_size = cc_logits.shape[0]
    tobeproject = cc_logits.transpose(0, 1).unsqueeze(0)
    ret = []
    for i in range(tobeproject.shape[-1]):
        ret.append(myverbalizer.project(tobeproject[:, :, i]).unsqueeze(-1))
    ret = torch.cat(ret, dim=-1)
    label_words_cc_logits = ret.squeeze()
    # Center each word's logits across the context dimension.
    label_words_cc_logits = label_words_cc_logits - label_words_cc_logits.mean(dim=-1, keepdim=True)
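    # Treat each class's first label word (typically the class name itself)
    # as that class's anchor, and compute every label word's cosine
    # similarity to every class anchor over the context dimension.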
    first_label_logits = label_words_cc_logits[:, 0, :]
    orgshape = label_words_cc_logits.shape
    label_words_cc_logits = label_words_cc_logits.reshape(-1, context_size)
    sim_mat = cosine_similarity(
        label_words_cc_logits.cpu().numpy(),
        first_label_logits.cpu().numpy(),
    ).reshape(*orgshape[:-1], first_label_logits.shape[0])
    # Push padded (masked-out) word slots far below any real similarity.
    sim_mat = sim_mat - 10000.0 * (1 - myverbalizer.label_words_mask.unsqueeze(-1).cpu().numpy())
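    # TF-IDF-style scoring: TF is a word's similarity to its own class
    # anchor; IDF is the inverse of a normalized p-norm of its similarities
    # to all the other classes' anchors.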
    new_label_words = []
    max_lbw_num_pclass = myverbalizer.label_words_mask.shape[-1]
    outputers = []
    for class_id in range(len(myverbalizer.label_words)):
        tfidf_scores = []
        tf_scores = []
        idf_scores = []
        for in_class_id in range(max_lbw_num_pclass):
            if myverbalizer.label_words_mask[class_id, in_class_id] > 0:
                word_sim_scores = sim_mat[class_id, in_class_id]
                # TF: similarity of this word to its own class anchor.
                tf_score = word_sim_scores[class_id]
                # IDF: inverse of the normalized norm_ord-norm of the word's
                # similarities to the other classes' anchors.
                idf_score_source = np.concatenate(
                    [word_sim_scores[:class_id], word_sim_scores[class_id + 1:]])
                idf_score = 1 / (np.linalg.norm(idf_score_source, ord=norm_ord)
                                 / np.power(class_num - 1, 1 / norm_ord))
                tfidf_score = tf_score * idf_score
                if tf_score < 0:
                    # Words negatively correlated with their own class are
                    # always pushed to the bottom of the ranking.
                    tfidf_score = -100
                tfidf_scores.append(tfidf_score)
                tf_scores.append(tf_score)
                idf_scores.append(idf_score)
        outputer = list(zip(myverbalizer.label_words[class_id],
                            tfidf_scores, tf_scores, idf_scores))
        # Rank the class's words by descending TF-IDF score.
        outputer = sorted(outputer, key=lambda x: -x[1])
        outputers.append(outputer)
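    # Choose one cut rate shared by all classes: for each candidate rate,
    # look at the boundary word's score in every class and prefer the rate
    # whose boundary scores sit closest to 1.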
    cut_optimality = []
    max_outputer_len = max(len(o) for o in outputers)
    for cut_potent in range(max_outputer_len):
        cut_rate = cut_potent / max_outputer_len
        loss = 0
        for class_id in range(len(myverbalizer.label_words)):
            cut_potent_this_class = int(cut_rate * len(outputers[class_id]))
            if len(outputers[class_id]) <= cut_potent_this_class:
                boundary_score = outputers[class_id][-1][1]
            else:
                boundary_score = outputers[class_id][cut_potent_this_class][1]
            loss += (boundary_score - 1) ** 2
        cut_optimality.append([cut_rate, loss])
    optimal_cut_rate = min(cut_optimality, key=lambda x: x[1])[0]
    print("optimal cut rate is {}".format(optimal_cut_rate))
    for class_id in range(len(myverbalizer.label_words)):
        cut = int(len(outputers[class_id]) * optimal_cut_rate)
        if cut == 0:
            cut = 1
        new_l = [x[0] for x in outputers[class_id][:cut]]
        removed_words = [x[0] for x in outputers[class_id][cut:]]
        myrecord += f"Class {class_id} {new_l}\n"
        myrecord += f"Class {class_id} rm: {removed_words}\n"
        new_label_words.append(new_l)
    myverbalizer.label_words = new_label_words
    # Setting label_words rebuilds the verbalizer's parameters, so move it
    # back to the GPU when one is available.
    if torch.cuda.is_available():
        myverbalizer = myverbalizer.cuda()
    new_label_words_num = [len(myverbalizer.label_words[i]) for i in range(len(class_labels))]
    myrecord += f"Phase 3 {new_label_words_num}\n"
    return myrecord
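
# A minimal usage sketch (an assumption for illustration, not part of this
# file): `myverbalizer` is taken to be an OpenPrompt-style verbalizer
# exposing `label_words`, `label_words_mask`, and `project(...)`, and
# `cc_logits` a [context_size, vocab_size] tensor of
# contextualized-calibration logits.
#
#     record = tfidf_filter(myverbalizer, cc_logits, class_labels=["neg", "pos"])
#     print(record)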