85 changes: 65 additions & 20 deletions gliner/evaluation/evaluator.py
@@ -1,7 +1,7 @@
import warnings
from collections import defaultdict
from typing import Union, List, Literal

import pandas as pd
import numpy as np
import torch

@@ -24,7 +24,7 @@ def _prf_divide(
result = np.true_divide(numerator, denominator)
result[denominator == 0] = 0.0 if zero_division in ["warn", 0] else 1.0

if denominator == 0 and zero_division == "warn" and metric in warn_for:
if np.any(denominator == 0) and zero_division == "warn" and metric in warn_for:
msg_start = f"{metric.title()}"
if "f-score" in warn_for:
msg_start += " and F-score" if metric in warn_for else "F-score"
@@ -36,7 +36,7 @@ def _prf_divide(
result_size=len(result),
)

return result
return np.round(result, decimals=2)


def _warn_prf(average: str, modifier: str, msg_start: str, result_size: int):
@@ -84,19 +84,44 @@ def flatten_for_eval(y_true, y_pred):
return all_true, all_pred


def compute_prf(y_true, y_pred, average="micro"):
def compute_prf(y_true:List,
y_pred:List,
):
all_metrics = {}
y_true, y_pred = flatten_for_eval(y_true, y_pred)

pred_sum, tp_sum, true_sum, target_names = extract_tp_actual_correct(y_true, y_pred)

if average == "micro":
tp_sum = np.array([tp_sum.sum()])
pred_sum = np.array([pred_sum.sum()])
true_sum = np.array([true_sum.sum()])

# Calculate macro metrics (average the per-class scores over the number of classes)
per_class_dico,metrics_macro = _calculate_metrics(tp_sum,pred_sum,true_sum,"macro")
all_metrics["macro"] = metrics_macro

# Calculate performance per class
all_metrics["per_class"] = {target_names[i]:{"precision":per_class_dico["precision"][i],
"recall":per_class_dico["recall"][i],
"f_score":per_class_dico["f_score"][i]}
for i in range(len(target_names))
}


# Calculate micro metrics (sum the tp/pred/true counts over all classes, then divide)
tp_sum_micro = np.array([tp_sum.sum()])
pred_sum_micro = np.array([pred_sum.sum()])
true_sum_micro = np.array([true_sum.sum()])

metrics_micro = _calculate_metrics(tp_sum_micro,pred_sum_micro,true_sum_micro,"micro")
all_metrics["micro"] = metrics_micro

return all_metrics

def _calculate_metrics(tp,
pred,
true,
average : Literal["micro","macro"],
):
precision = _prf_divide(
numerator=tp_sum,
denominator=pred_sum,
numerator=tp,
denominator=pred,
metric="precision",
modifier="predicted",
average=average,
@@ -105,20 +105,30 @@ def compute_prf(y_true, y_pred, average="micro"):
)

recall = _prf_divide(
numerator=tp_sum,
denominator=true_sum,
numerator=tp, # TODO check
denominator=true,
metric="recall",
modifier="true",
average=average,
warn_for=["precision", "recall", "f-score"],
zero_division="warn",
)

denominator = precision + recall
denominator[denominator == 0.0] = 1
f_score = 2 * (precision * recall) / denominator
denominator_fscore = precision + recall
denominator_fscore[denominator_fscore == 0.0] = 1
f_score = np.round(2 * (precision * recall) / denominator_fscore, decimals=2)

if average == "micro" :

return {"precision": precision[0], "recall": recall[0], "f_score": f_score[0]}

else :
per_class_array = {"precision": precision, "recall": recall, "f_score": f_score}
macro_precision = np.round(precision.sum() / len(precision), decimals=2)
macro_recall = np.round(recall.sum() / len(recall), decimals=2)
macro_f_score = np.round(f_score.sum() / len(f_score), decimals=2)
return per_class_array, {"precision": macro_precision, "recall": macro_recall, "f_score": macro_f_score}

return {"precision": precision[0], "recall": recall[0], "f_score": f_score[0]}


class Evaluator:
@@ -150,10 +185,20 @@ def transform_data(self):

@torch.no_grad()
def evaluate(self):
"""
output : {

"per_class":{"tag1":{"precision":int, "recall":int,"f_score":int},
"tag2":{}...
},
"micro":{"precision":int, "recall":int,"f_score":int},
"macro":{"precision":int, "recall":int,"f_score":int},
}
"""
all_true_typed, all_outs_typed = self.transform_data()
precision, recall, f1 = compute_prf(all_true_typed, all_outs_typed).values()
output_str = f"P: {precision:.2%}\tR: {recall:.2%}\tF1: {f1:.2%}\n"
return output_str, f1
results_all = compute_prf(all_true_typed, all_outs_typed)
# output_str = f"P: {precision:.2%}\tR: {recall:.2%}\tF1: {f1:.2%}\n"
return results_all


def is_nested(idx1, idx2):
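For context on the averaging change above: a minimal standalone sketch of the difference between the new macro and micro aggregation. It uses NumPy only, with made-up per-class counts, and mirrors the logic of `_calculate_metrics` without being the PR code itself.

import numpy as np

# Made-up per-class counts (one entry per entity type):
# tp = true positives, pred = predicted spans, true = gold spans.
tp = np.array([8, 2, 5])
pred = np.array([10, 4, 5])
true = np.array([9, 3, 8])

# Macro averaging: compute precision/recall/F1 per class, then average over classes.
p_class = tp / pred
r_class = tp / true
denom = np.where(p_class + r_class == 0, 1, p_class + r_class)
f_class = 2 * p_class * r_class / denom
macro = {name: round(float(vals.mean()), 2)
         for name, vals in {"precision": p_class, "recall": r_class, "f_score": f_class}.items()}

# Micro averaging: sum the counts over all classes first, then compute a single score.
p_micro = tp.sum() / pred.sum()
r_micro = tp.sum() / true.sum()
f_micro = 2 * p_micro * r_micro / (p_micro + r_micro)
micro = {"precision": round(float(p_micro), 2),
         "recall": round(float(r_micro), 2),
         "f_score": round(float(f_micro), 2)}

print(macro)  # {'precision': 0.77, 'recall': 0.73, 'f_score': 0.73}
print(micro)  # {'precision': 0.79, 'recall': 0.75, 'f_score': 0.77}

Macro weights every entity type equally, while micro weights every predicted span equally, so rare classes pull the macro score down more visibly than the micro score.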
37 changes: 35 additions & 2 deletions gliner/model.py
@@ -2,6 +2,7 @@
import os
import re
import warnings
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from typing import Dict, List, Optional, Union
@@ -652,10 +653,42 @@ def evaluate(
all_trues.extend(batch["entities"])
# Evaluate the predictions
evaluator = Evaluator(all_trues, all_preds)
out, f1 = evaluator.evaluate()
# out, f1 = evaluator.evaluate()

return out, f1
# return out, f1
all_results = evaluator.evaluate()

return all_results

def beautiful_df_print(self, eval_results: Dict):
"""
eval_results: nested dict of evaluation metrics, as returned by Evaluator.evaluate():
{
    "per_class": {"tag1": {"precision": float, "recall": float, "f_score": float},
                  "tag2": {...},
                  ...},
    "micro": {"precision": float, "recall": float, "f_score": float},
    "macro": {"precision": float, "recall": float, "f_score": float},
}
"""
# add micro and macro metrics
df_metrics = pd.DataFrame()
df_metrics["MICRO_AVG"] = eval_results["micro"]
df_metrics["MACRO_AVG"] = eval_results["macro"]

# add separator line
df_metrics = df_metrics.transpose()
df_metrics.loc["-------"] = {'precision': '---', 'recall': '---', 'f_score': '---'}


# add results per class
df_per_class = pd.DataFrame(eval_results["per_class"])
df_per_class = df_per_class.transpose().sort_values(by='f_score', ascending=False)

df = pd.concat([df_metrics, df_per_class])

return df

def encode_labels(self, labels: List[str], batch_size: int = 8) -> torch.FloatTensor:
"""
Embedding of labels.
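Assuming the interface introduced in this diff, end-to-end usage might look roughly like the sketch below. The checkpoint name, the `test_data` samples, and the label list are placeholders, and the `evaluate` keyword arguments follow the existing GLiNER evaluation API as I understand it; treat them as assumptions rather than a definitive call signature.

from gliner import GLiNER

# Load a GLiNER checkpoint (placeholder model name).
model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")

# Placeholder evaluation samples in the usual GLiNER format (assumed here):
# tokenized text plus gold spans as [start_token, end_token, label].
test_data = [
    {"tokenized_text": ["Barack", "Obama", "visited", "Paris", "."],
     "ner": [[0, 1, "person"], [3, 3, "location"]]},
]

# With this PR, evaluate() returns the nested dict described in the docstring above:
# {"per_class": {...}, "micro": {...}, "macro": {...}}
eval_results = model.evaluate(test_data, flat_ner=True, entity_types=["person", "location"])

# beautiful_df_print() turns that dict into a DataFrame: MICRO_AVG and MACRO_AVG
# rows, a separator row, then one row per class sorted by f_score.
df = model.beautiful_df_print(eval_results)
print(df)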