-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoffline_policy_evaluation_quickstart.py
More file actions
60 lines (49 loc) · 1.8 KB
/
offline_policy_evaluation_quickstart.py
File metadata and controls
60 lines (49 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""Offline policy evaluation for an adaptive-learning policy.
Run with:
PYTHONPATH=src python examples/offline_policy_evaluation_quickstart.py
"""
from __future__ import annotations
import json
import pandas as pd
from orchid_ranker.ope import compare_logged_policies, deterministic_policy_probabilities
def build_logged_events(n_learners: int = 60) -> pd.DataFrame:
    """Build a synthetic log of learner/action events for the quickstart.

    Every learner contributes one row per action ("review" and "stretch").
    Learners whose id is not a multiple of 3 get "stretch" as their target
    action; the others get "review". A row's reward is 1.0 when the logged
    action equals the learner's target action and 0.0 otherwise.

    Args:
        n_learners: Number of synthetic learners, with ids
            ``0 .. n_learners - 1``. Defaults to 60, the size the
            quickstart has always used.

    Returns:
        A DataFrame with two rows per learner and the columns
        ``learner_id``, ``action``, ``reward``, ``logging_propensity``,
        ``target_action``, ``target_value``, ``baseline_value`` and
        ``logged_action_value``. The columns are present even when
        ``n_learners`` is 0.
    """
    # Declared up front so that an empty log still carries the full schema
    # instead of collapsing to a column-less frame.
    columns = [
        "learner_id",
        "action",
        "reward",
        "logging_propensity",
        "target_action",
        "target_value",
        "baseline_value",
        "logged_action_value",
    ]
    actions = ("review", "stretch")

    rows = []
    for learner_id in range(n_learners):
        # Ids divisible by 3 are the "review" learners; all others "stretch".
        target_action = "stretch" if learner_id % 3 != 0 else "review"
        for action in actions:
            reward = 1.0 if action == target_action else 0.0
            rows.append(
                {
                    "learner_id": learner_id,
                    "action": action,
                    "reward": reward,
                    # Both actions are logged once per learner.
                    "logging_propensity": 0.5,
                    "target_action": target_action,
                    "target_value": 1.0,
                    "baseline_value": 0.5,
                    # The value of the logged action is its observed reward.
                    "logged_action_value": reward,
                }
            )
    return pd.DataFrame(rows, columns=columns)
def main() -> None:
    """Run the quickstart: build the log, compare policies, print JSON.

    Adds ``target_probability`` and ``baseline_probability`` columns to the
    synthetic log, passes it to ``compare_logged_policies`` and prints the
    report's ``to_dict()`` output as indented JSON with sorted keys.
    """
    logged = build_logged_events()

    # The helper receives the logged actions first, then the target actions.
    logged_actions = logged["action"].tolist()
    target_actions = logged["target_action"].tolist()
    logged["target_probability"] = deterministic_policy_probabilities(
        logged_actions,
        target_actions,
    )
    # Constant baseline probability for every row.
    logged["baseline_probability"] = 0.5

    # Names of the columns compare_logged_policies should read.
    column_names = {
        "reward_col": "reward",
        "propensity_col": "logging_propensity",
        "target_probability_col": "target_probability",
        "baseline_probability_col": "baseline_probability",
        "target_value_col": "target_value",
        "baseline_value_col": "baseline_value",
        "logged_action_value_col": "logged_action_value",
    }
    report = compare_logged_policies(logged, **column_names)

    rendered = json.dumps(report.to_dict(), indent=2, sort_keys=True)
    print(rendered)
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()