-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodel.py
More file actions
117 lines (97 loc) · 4.29 KB
/
model.py
File metadata and controls
117 lines (97 loc) · 4.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import joblib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from feature_engineering import features_to_scale, cols_to_minmax
from feature_engineering import X_train, y_train, X_test, y_test, test_df
# Hyper-parameter search space, keyed by the same short label used for the
# estimator in `lr_model` so both can be looked up by name in the loop below.
param_grid = {
    "LR": {
        'C': [0.001, 0.005, 0.01],
        'solver': ['lbfgs'],
    },
}

# Class 0 carries a slightly larger weight (1.2) than class 1 (1.0).
lr_model = {"LR": LogisticRegression(class_weight={0: 1.2, 1: 1.0})}

# Cross-validation splitter handed to GridSearchCV: 5 time-series splits.
tscv = TimeSeriesSplit(n_splits=5)

# label -> best estimator found by the grid search for that label.
best_models = {}

for name, model in lr_model.items():
    print(f"--- Running Grid Search for {name} ---")

    search_options = dict(
        estimator=model,
        param_grid=param_grid[name],
        cv=tscv,
        scoring='f1',
        n_jobs=-1,
    )
    grid_search = GridSearchCV(**search_options)
    grid_search.fit(X_train, y_train)

    best_models[name] = grid_search.best_estimator_

    print(f"Best Params for {name}: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_:.4f}\n")

# Decision thresholds to evaluate later in the script.
# An alternative, higher grid that is currently disabled:
# thresholds = [0.50, 0.52, 0.54, 0.56, 0.58, 0.60, 0.62]
thresholds = [0.35, 0.40, 0.45, 0.48, 0.50, 0.55]

# Histogram of the predicted class-1 probabilities on the test set.
probs = best_models['LR'].predict_proba(X_test)[:, 1]
plt.hist(probs, bins=50)
plt.title("Probability Distribution")
plt.show()
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero or non-finite."""
    # pandas' .std() yields NaN for fewer than two observations, and
    # `NaN != 0` is True, so a plain `!= 0` guard lets NaN leak through.
    if not np.isfinite(denominator) or denominator == 0:
        return 0.0
    return numerator / denominator


def get_comprehensive_metrics(model, X_test, y_test, real_returns, cost_bps=20, threshold=0.50):
    """Evaluate a classifier's class-1 predictions as trade signals.

    A row is "traded" when the predicted class-1 probability is at least
    ``threshold``; a traded row earns its ``real_returns`` value minus the
    transaction cost, an untraded row earns 0.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True labels, used for precision/recall only.
        real_returns: pandas Series of realised returns, one per row of
            ``X_test``; its index must contain a level named ``'Date'``
            (used to aggregate rows into a per-date portfolio return).
        cost_bps: Cost charged per traded row, in basis points.
        threshold: Probability cut-off for taking a trade.

    Returns:
        dict with keys ``Trades``, ``Precision``, ``Recall``, ``Sharpe``,
        ``Sortino``, ``Profit Factor``, ``Calmar``, ``MDD`` (all scalars) and
        ``EquityCurve`` (cumulative product of 1 + mean per-date return).
        Any ratio whose denominator is zero or undefined is reported as 0.
    """
    # 1. Predictions (0 or 1) from the class-1 probability.
    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)
    cost_pct = cost_bps / 10000  # basis points -> fraction

    # 2. Statistical metrics (against y_test).
    precision = precision_score(y_test, preds, zero_division=0)
    recall = recall_score(y_test, preds, zero_division=0)

    # 3. Financial metrics (against real_returns).
    # Per-row strategy return: realised return net of cost where a trade is
    # taken, 0 elsewhere. The result keeps real_returns' index.
    strat_returns = (preds * real_returns) - (preds * cost_pct)
    mean_return = strat_returns.mean()

    # NOTE(review): Sharpe, Sortino and annual_return are annualised with 252
    # from per-row returns, not from the per-date portfolio series below. If
    # there are several rows per date these are not true daily statistics -
    # confirm this is intended.
    downside_std = strat_returns[strat_returns < 0].std()
    sortino = _safe_ratio(mean_return, downside_std) * np.sqrt(252)

    total_std = strat_returns.std()
    sharpe = _safe_ratio(mean_return, total_std) * np.sqrt(252)

    # Equity curve: average the rows of each date, then compound.
    daily_portfolio_returns = strat_returns.groupby(level='Date').mean()
    equity_curve = (1 + daily_portfolio_returns).cumprod()

    # Profit factor: sum of gains / absolute sum of losses.
    gains = strat_returns[strat_returns > 0].sum()
    losses = abs(strat_returns[strat_returns < 0].sum())
    profit_factor = _safe_ratio(gains, losses)

    # Max drawdown: most negative relative drop from the running peak.
    running_max = equity_curve.cummax()
    drawdown = (equity_curve - running_max) / running_max
    mdd = drawdown.min()

    # Calmar ratio: annualised return / |max drawdown|.
    annual_return = mean_return * 252
    calmar = _safe_ratio(annual_return, abs(mdd))

    total_trades = preds.sum()

    return {
        "Trades": total_trades,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "Sharpe": round(sharpe, 2),
        "Sortino": round(sortino, 2),
        "Profit Factor": round(profit_factor, 2),
        "Calmar": round(calmar, 2),
        "MDD": round(mdd, 4),
        "EquityCurve": equity_curve,
    }
# Sweep the decision thresholds: print one metrics row per threshold and
# overlay the resulting equity curves on a single figure.

# Transaction cost applied to every trade, in basis points. Passed explicitly
# so the plot title can never disagree with the value actually used.
# NOTE(review): 20 matches get_comprehensive_metrics' default; the original
# title/comment said "10bps" - confirm which cost is intended.
cost_bps = 20

plt.figure(figsize=(10, 6))

header = (
    f"{'Threshold':<10} | {'Trades':<8} | {'Precision':<10} | {'Sharpe':<8} | "
    f"{'Profit Factor':<13} | {'Calmar':<8} | {'Recall':<8} | {'MDD':<8} | {'Sortino':<8}"
)
print(header)
print("-" * len(header))

for t in thresholds:
    # `threshold` must be passed by keyword: the fifth positional parameter
    # of get_comprehensive_metrics is cost_bps, so passing `t` positionally
    # would vary the cost and leave the threshold at its 0.50 default.
    m = get_comprehensive_metrics(
        best_models['LR'],
        X_test,
        y_test,
        test_df['next_day_return'],
        cost_bps=cost_bps,
        threshold=t,
    )
    print(
        f"{t:<10} | {m['Trades']:<8} | {m['Precision']:<10} | {m['Sharpe']:<8} | "
        f"{m['Profit Factor']:<13} | {m['Calmar']:<8} | {m['Recall']:<8} | {m['MDD']:<8} | {m['Sortino']:<8}"
    )
    # Plot each threshold to see which one survives the transaction cost.
    plt.plot(m['EquityCurve'], label=f"Threshold {t}")

plt.legend()
plt.title(f"LR Equity Curve: Impact of Thresholds & {cost_bps}bps Costs")
plt.grid(True, alpha=0.3)
plt.show()
# Inspect the fitted LR coefficients: one row per feature, ordered from the
# most positive weight to the most negative (signed, not absolute, order).
# Presumably features_to_scale + cols_to_minmax lists the columns in the same
# order the model was trained on - verify against feature_engineering.
final_lr = best_models['LR']
feature_names = features_to_scale + cols_to_minmax
importance = pd.DataFrame({
    'Feature': feature_names,
    'Weight': final_lr.coef_[0],
})
importance = importance.sort_values(by='Weight', ascending=False)
print(importance)

# Persist the tuned model to disk.
joblib.dump(final_lr, 'vector_alpha_lr_model.pkl')
# To load it back later:
# model = joblib.load('vector_alpha_lr_model.pkl')