Vector-Alpha/model.py at master · davex-ai/Vector-Alpha · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import joblib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

from feature_engineering import features_to_scale, cols_to_minmax
from feature_engineering import X_train, y_train,  X_test, y_test,  test_df

# Hyper-parameter candidates, keyed by the same short name used in `lr_model`.
param_grid = {
    "LR": {
        "C": [0.001, 0.005, 0.01],
        "solver": ["lbfgs"],
    },
}

# Estimators to tune. Class 0 carries a weight of 1.2 against 1.0 for class 1.
lr_model = {"LR": LogisticRegression(class_weight={0: 1.2, 1: 1.0})}

# Cross-validation splitter handed to GridSearchCV (5 time-ordered splits).
tscv = TimeSeriesSplit(n_splits=5)

# name -> best estimator found by the grid search for that name.
best_models = {}
for name, model in lr_model.items():
    print(f"--- Running Grid Search for {name} ---")

    # Candidates are ranked by F1 score; n_jobs=-1 uses every available core.
    grid_search = GridSearchCV(
        model,
        param_grid[name],
        scoring="f1",
        n_jobs=-1,
        cv=tscv,
    )
    grid_search.fit(X_train, y_train)

    best_models[name] = grid_search.best_estimator_
    print(f"Best Params for {name}: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_:.4f}\n")

# Decision thresholds to sweep over.
# An earlier candidate grid, kept for reference:
#   [0.50, 0.52, 0.54, 0.56, 0.58, 0.60, 0.62]
thresholds = [0.35, 0.40, 0.45, 0.48, 0.50, 0.55]

# Column 1 of predict_proba is the predicted probability of class 1.
probs = best_models["LR"].predict_proba(X_test)[:, 1]

# Histogram of those probabilities, to see where they cluster before
# choosing thresholds.
plt.hist(probs, bins=50)
plt.title("Probability Distribution")
plt.show()
def _ratio_or_zero(numerator, denominator, scale=1):
    """Return ``numerator / denominator * scale``, or 0 if the denominator is 0 or NaN."""
    # pandas' .std() yields NaN for fewer than two observations, and NaN != 0
    # is True, so a bare "!= 0" guard would let NaN leak into the result.
    if pd.isna(denominator) or denominator == 0:
        return 0
    return (numerator / denominator) * scale


def get_comprehensive_metrics(model, X_test, y_test, real_returns, cost_bps=20, threshold=0.50):
    """Score a classifier statistically and as a long-or-flat trading signal.

    A trade is taken on every row whose predicted class-1 probability is at
    least ``threshold``. Each trade earns that row's realised return minus a
    flat transaction cost; rows without a trade contribute a return of 0.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True labels, used only for precision and recall.
        real_returns: pandas Series of realised returns, one per row of
            ``X_test``, whose index has a level named ``'Date'``.
        cost_bps: Cost charged per trade, in basis points (divided by 10000).
        threshold: Minimum class-1 probability required to take a trade.

    Returns:
        dict with keys ``"Trades"``, ``"Precision"``, ``"Recall"``,
        ``"Sharpe"``, ``"Sortino"``, ``"Profit Factor"``, ``"Calmar"``,
        ``"MDD"`` (rounded scalars) and ``"EquityCurve"`` (Series of
        cumulative growth per date). A ratio whose denominator is zero or
        undefined (NaN) is reported as 0.
    """
    # 1. Predictions: 1 (trade) where the class-1 probability clears the threshold.
    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= threshold).astype(int)

    cost_pct = cost_bps / 10000  # basis points -> fraction

    # 2. Statistical metrics (against the true labels).
    precision = precision_score(y_test, preds, zero_division=0)
    recall = recall_score(y_test, preds, zero_division=0)

    # 3. Financial metrics (against realised returns), net of costs.
    strat_returns = (preds * real_returns) - (preds * cost_pct)
    mean_return = strat_returns.mean()
    # The sqrt(252) / 252 factors annualise per-row figures
    # (presumably one row per instrument per trading day - confirm).
    annualizer = np.sqrt(252)

    # Sortino: mean return over the standard deviation of losing rows only.
    downside_std = strat_returns[strat_returns < 0].std()
    sortino = _ratio_or_zero(mean_return, downside_std, annualizer)

    # Sharpe: mean return over the standard deviation of all rows.
    sharpe = _ratio_or_zero(mean_return, strat_returns.std(), annualizer)

    # Equity curve: average the rows of each date into one portfolio return,
    # then compound those returns over time.
    daily_portfolio_returns = strat_returns.groupby(level='Date').mean()
    equity_curve = (1 + daily_portfolio_returns).cumprod()

    # Profit factor: sum of gains / absolute sum of losses.
    gains = strat_returns[strat_returns > 0].sum()
    losses = abs(strat_returns[strat_returns < 0].sum())
    profit_factor = _ratio_or_zero(gains, losses)

    # Max drawdown: deepest relative fall of the equity curve below its running peak.
    running_max = equity_curve.cummax()
    drawdown = (equity_curve - running_max) / running_max
    mdd = drawdown.min()

    # Calmar: annualised mean return / |max drawdown|.
    annual_return = mean_return * 252
    calmar = _ratio_or_zero(annual_return, abs(mdd))

    total_trades = preds.sum()

    return {
        "Trades": total_trades,
        "Precision": round(precision, 3),
        "Recall": round(recall, 3),
        "Sharpe": round(sharpe, 2),
        "Sortino": round(sortino, 2),
        "Profit Factor": round(profit_factor, 2),
        "Calmar": round(calmar, 2),
        "MDD": round(mdd, 4),
        "EquityCurve": equity_curve
    }
plt.figure(figsize=(10, 6))

# Per-trade cost used for the whole sweep, in basis points.
# NOTE(review): the earlier comment and plot title said 10 bps, while
# get_comprehensive_metrics defaults to 20 - confirm the intended figure.
COST_BPS = 20

# Metrics printed for each threshold, in display order (keys of the metrics dict).
metric_columns = ["Trades", "Precision", "Sharpe", "Profit Factor", "Calmar", "Recall", "MDD", "Sortino"]
col_widths = {col: max(len(col), 8) for col in metric_columns}

header = " | ".join(
    [f"{'Threshold':<10}"] + [f"{col:<{col_widths[col]}}" for col in metric_columns]
)
print(header)
print("-" * len(header))
for t in thresholds:
    # The threshold must go by keyword: the fifth positional parameter of
    # get_comprehensive_metrics is cost_bps, so a positional `t` would be
    # read as the cost and every run would use the default threshold.
    m = get_comprehensive_metrics(
        best_models['LR'],
        X_test,
        y_test,
        test_df['next_day_return'],
        cost_bps=COST_BPS,
        threshold=t,
    )
    print(" | ".join([f"{t:<10}"] + [f"{m[col]:<{col_widths[col]}}" for col in metric_columns]))

    # Overlay this threshold's equity curve so thresholds can be compared net of costs.
    plt.plot(m['EquityCurve'], label=f"Threshold {t}")

plt.legend()
plt.title(f"LR Equity Curve: Impact of Thresholds & {COST_BPS}bps Costs")
plt.grid(True, alpha=0.3)
plt.show()

# Pair every input feature name with its fitted logistic-regression
# coefficient and list them from the most positive weight to the most negative.
importance = pd.DataFrame(
    {
        "Feature": features_to_scale + cols_to_minmax,
        "Weight": best_models["LR"].coef_[0],
    }
)
importance = importance.sort_values(by="Weight", ascending=False)
print(importance)

# Persist the tuned model to disk.
joblib.dump(best_models["LR"], "vector_alpha_lr_model.pkl")

# Reload it later with:
# model = joblib.load('vector_alpha_lr_model.pkl')