diff --git a/firefin/evaluation/academia/AcaEvaluatorModel.py b/firefin/evaluation/academia/AcaEvaluatorModel.py index 4ec679f..4bc39ba 100644 --- a/firefin/evaluation/academia/AcaEvaluatorModel.py +++ b/firefin/evaluation/academia/AcaEvaluatorModel.py @@ -1,6 +1,20 @@ import typing import pandas as pd -from ..eva_utils import compute_ic, summarise_ic, generate_latex_code, ForwardReturns, QuantileReturns +from ..eva_utils import ( + compute_ic, + summarise_ic, + generate_latex_code, + ForwardReturns, + QuantileReturns, + single_sort_table1_latex, + single_sort_table2_latex, + single_sort_table3_latex, + fama_macbeth_latex, + regression_latex, + else1_latex, + else2_latex, + else3_latex +) from ...core.algorithm.regression import least_square, RollingRegressor, BatchRegressionResult from ...common.config import logger from .anomaly_test import AnomalyTest @@ -142,7 +156,6 @@ def run_ic( pd.DataFrame A DataFrame containing IC values for each evaluation period. """ - ic = compute_ic(self.factor, self.forward_returns, method = method) if plot: plots.plt_ic(ic, plot_dir = plot_dir) @@ -270,4 +283,32 @@ def run_all(self) -> dict: results['anomaly_stat'] = {k:self.run_anomaly_test(portfolio_returns= pd.DataFrame(v.iloc[:,-1]), return_stats= True)} logger.info("Anomaly Test Completed") - return results \ No newline at end of file + return results + + # TODO: Complete the comments about df and return + @staticmethod + def output_latex(df_dict: dict): + keys = ['table1', 'table2', 'table3', 'fama_macbeth', 'regression', 'else1', 'else2', 'else3'] + str1 = single_sort_table1_latex(df1 = df_dict['df1'], df2 = df_dict['df2']) + str2 = single_sort_table2_latex(df = df_dict['df3']) + str3 = single_sort_table3_latex(df1 =df_dict['df4'], df2 = df_dict['df5']) + str4 = fama_macbeth_latex( + df1 = df_dict['df6'], + df2 = df_dict['df7'], + df3 = df_dict['df8'], + df4 = df_dict['df9'] + ) + str5 = regression_latex( + df1 = df_dict['df10'], + df2 = df_dict['df11'], + df3 = 
df_dict['df12'], + df4 = df_dict['df13'] + ) + str6 = else1_latex(df1 = df_dict['df14'], df2 = df_dict['df15']) + str7 = else2_latex(df = df_dict['df16']) + str8 = else3_latex(df = df_dict['df17']) + values = [str1, str2, str3, str4, str5, str6, str7, str8] + + result_dict = dict(zip(keys, values)) + return result_dict + diff --git a/firefin/evaluation/eva_utils.py b/firefin/evaluation/eva_utils.py index d338725..a98ac31 100644 --- a/firefin/evaluation/eva_utils.py +++ b/firefin/evaluation/eva_utils.py @@ -4,9 +4,10 @@ # TODO: Move some common algorithms to fire/core/algorithm/ import typing - +from typing import List, Optional import numpy as np import pandas as pd +import re __all__ = [ "compute_forward_returns", @@ -221,6 +222,732 @@ def generate_latex_code(plot_path: str, summary_table: pd.DataFrame) -> str: return latex_code +def _format_df_cols( + df: pd.DataFrame, + percent_cols: List[int] = None, + bracket_cols: List[int] = None +) -> pd.DataFrame: + """ + Format specific columns in a DataFrame: + - Columns in `percent_cols` will be formatted as percentages (e.g., '12.34%') + - Columns in `bracket_cols` will be formatted as bracketed values (e.g., '(12.34)') + - All other numeric columns will be formatted to 2 decimal places (e.g., '12.34') + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame to format. + percent_cols : list of int, optional + List of column indices to format as plain percentages. + bracket_cols : list of int, optional + List of column indices, formatted as value with brackets. + + Returns + ------- + pd.DataFrame + A new DataFrame with specified columns formatted as strings. 
+ """ + formatted_df = df.copy().astype(object) + + percent_cols = percent_cols or [] + bracket_cols = bracket_cols or [] + all_formatted = set(percent_cols + bracket_cols) + + for col_idx in percent_cols: + formatted_df.iloc[:, col_idx] = formatted_df.iloc[:, col_idx].map(lambda x: f"{x:.2%}") + + for col_idx in bracket_cols: + formatted_df.iloc[:, col_idx] = formatted_df.iloc[:, col_idx].map(lambda x: f"({x:.2})") + + # Format remaining columns to 2 decimal places + for col_idx in range(formatted_df.shape[1]): + if col_idx not in all_formatted: + formatted_df.iloc[:, col_idx] = formatted_df.iloc[:, col_idx].map(lambda x: f"{x:.2f}") + + return formatted_df + +def _interleave_dfs( + df1: pd.DataFrame, + df2: pd.DataFrame, + correspondence: List[List[int]], + interleave_rows: Optional[List[int]] = None +) -> pd.DataFrame: + ''' + Interleave two DataFrames row-wise based on column correspondence. + + This function aligns columns of `df2` to `df1` using the provided correspondence, + then interleaves rows from both DataFrames, inserting `None` for missing indices in `df2`. + + Parameters + ---------- + df1: pd.DataFrame + The primary DataFrame whose structure determines the output format. + Its index and columns are used as the template for alignment. + + df2: pd.DataFrame + The secondary DataFrame whose columns will be aligned to `df1`. + + correspondence: List[List[int]] + A list of column index pairs `[i, j]`, where: + - `i` is the column index in `df1` (and output DataFrame). + - `j` is the column index in `df2` to map to `df1`'s column `i`. + Example: `[[0, 1], [2, 0]]` maps `df2[:, 1]` to `df1[:, 0]`, and `df2[:, 0]` to `df1[:, 2]`. + + interleave_rows : Optional[List[int]], default=None + If provided, specifies which row indices (by position) to interleave from `df1`. + Other rows from `df1` will be appended after interleaving. + + Returns + ------- + pd.DataFrame + Interleaved DataFrame: df1 row, then df2 row (with NaNs except at mapped positions), repeated. 
+ ''' + if interleave_rows is None: + interleave_rows = list(range(df1.shape[0])) + interleave_index = df1.index[interleave_rows] + + # Create aligned version of df2 + df2_aligned = pd.DataFrame( + index = interleave_index, + columns = df1.columns, + dtype = object + ) + for i, j in correspondence: + df2_aligned.iloc[:, i] = df2.iloc[:, j] + + # Add index column for LaTeX use + df1_full = df1.copy() + df1_full.insert( + loc = 0, + column = 'Index', + value = df1.index + ) + df2_full = df2_aligned.copy() + df2_full.insert( + loc = 0, + column = 'Index', + value = [None] * len(interleave_rows) + ) + + # Interleave selected rows + rows = [] + for (_, row1), (_, row2) in zip(df1_full.iloc[interleave_rows].iterrows(), df2_full.iterrows()): + rows.extend([row1, row2]) + interleaved = pd.DataFrame(rows, columns = df1_full.columns) + + # Append non-interleaved rows if any + non_interleave_df = df1_full.drop(interleave_index) + if not non_interleave_df.empty: + interleaved = pd.concat([interleaved, non_interleave_df], axis = 0) + + return interleaved + +# TODO: find a more proper name standing for its function +def single_sort_table1_latex( + df1: pd.DataFrame, + df2: pd.DataFrame, +) -> str: + ''' + Generate a LaTeX-formatted table from two related DataFrames (df1 and df2), + interleaving selected rows with standard errors and returning the final LaTeX string. + Simply print the return value to get reproducible LaTeX code. + + Parameters + ---------- + df1 : pd.DataFrame + A DataFrame containing the main results for each portfolio. + The index should represent portfolio names. 
+ Columns (in order) must be: + 0: Monthly Excess Return + 1: Standard Deviation + 2: Alpha (CAPM) + 3: VWRF (CAPM) + 4: Adj R-squared (CAPM) + 5: Alpha (4-Factor Model) + 6: RMRF (4-Factor Model) + 7: SMB (4-Factor Model) + 8: HML (4-Factor Model) + 9: PR1YR (4-Factor Model) + 10: Adj R-squared (4-Factor Model) + + df2 : pd.DataFrame + A DataFrame containing standard errors corresponding to selected columns from df1. + The index must exactly match df1. + The columns (in order) are: + 0: Std of Alpha (CAPM) + 1: Std of VWRF (CAPM) + 2: Std of Alpha (4-Factor Model) + 3: Std of RMRF (4-Factor Model) + 4: Std of SMB (4-Factor Model) + 5: Std of HML (4-Factor Model) + 6: Std of PR1YR (4-Factor Model) + + Returns + ------- + str + Full LaTeX codes as a single string + ''' + # index = ['1A', '1B', '1C', '1 (high)'] + list(range(2, 10)) + \ + # ['10 (low)', '10A', '10B', '10C', '1-10 spread', '1A-1C spread', '9-10 spread'] + percent_cols = [[0, 1, 2, 5], []] + bracket_cols = [[], list(range(df2.shape[1]))] + mean_std_pairs = [[2, 0], [3, 1], [5, 2], [6, 3], [7, 4], [8, 5], [9, 6]] + + formatted_df1 = _format_df_cols( + df1, + percent_cols[0], + bracket_cols[0] + ) + formatted_df2 = _format_df_cols( + df2, + percent_cols[1], + bracket_cols[1] + ) + + df3 = _interleave_dfs( + formatted_df1, + formatted_df2, + correspondence = mean_std_pairs + ) + + df3_latex_code = df3.to_latex(header = False, index = False, escape = True).replace('NaN', '') + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in df3_latex_code.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule']) + ] + + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{tabular}{*{12}c}', + r'\toprule', + r' & & & \multicolumn{3}{c}{\multirow{2}{*}{CAPM}} & \multicolumn{6}{c}{\multirow{2}{*}{4-Factor Model}} \\', + r' & Monthly & & \multicolumn{3}{c}{\hrulefill} & \multicolumn{6}{c}{\hrulefill} \\', + r' & Excess & Std & & & Adj & & & 
& & & Adj \\ ', + r'Portfolio & Return & Dev & Alpha & VWRF & R-sq & Alpha & RMRF & SMB & HML & PR1YR & R-sq \\', + ] + df_latex_code + [ + r'\end{table}' + ] + latex_code = '\n'.join(latex_code) + + return latex_code + +# TODO: find a more proper name standing for its function +def single_sort_table2_latex( + df: pd.DataFrame, +) -> str: + ''' + Generate a LaTeX-formatted table from a single DataFrame. + Simply print the return value to get reproducible LaTeX code. + + Parameters + ---------- + df : pd.DataFrame + A DataFrame containing results for each portfolio. + The index should represent portfolio names. + Columns (in order) must be: + 0: Excess Return + 1: Standard Deviation + 2: Alpha (4-Factor Model Ordinary Least Squares (OLS) Estimates) + 3: Alpha-t (4-Factor Model Ordinary Least Squares (OLS) Estimates) + 4: RMRF (4-Factor Model Ordinary Least Squares (OLS) Estimates) + 5: SMB (4-Factor Model Ordinary Least Squares (OLS) Estimates) + 6: HML (4-Factor Model Ordinary Least Squares (OLS) Estimates) + 7: PR1YR (4-Factor Model Ordinary Least Squares (OLS) Estimates) + 8: Expense Ratio + 9: Turnover (Mturn) + 10: Roundtrip Transaction Costs + 11: Adjusted Alpha + + Returns + ------- + str + Full LaTeX codes as a single string + ''' + # index = ['1 (high)'] + list(range(2, 10)) + ['10 (low)', '1-10 spread', '9-10 spread'] + + percent_cols = [0, 1, 2, 10, 11] + bracket_cols = [3] + + formatted_df = _format_df_cols(df, percent_cols, bracket_cols).to_latex(header = False, escape = True) + + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in formatted_df.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule']) + ] + + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{tabular}{*{13}c}', + r'\toprule', + r' & & & \multicolumn{6}{c}{\multirow{2}{*}{4-Factor Model Ordinary Least Squares (OLS) Estimates}} & & & Roundtrip \\', + r' & Excess & Standard & 
\multicolumn{6}{c}{\hrulefill} & Exp & Turn & Transaction & Adjusted \\', + r'Portfolio & Return & Deviation & Alpha & Alpha-t & RMRF & SMB & HML & PR1YR & Ration & (Mturn) & Costs & Alpha \\', + ] + df_latex_code + [ + r'\end{table}' + ] + latex_code = '\n'.join(latex_code) + + return latex_code + +# TODO: find a more proper name standing for its function +def single_sort_table3_latex( + df1: pd.DataFrame, + df2: pd.DataFrame +) -> str: + ''' + Generate a LaTeX-formatted table from two related DataFrames (df1 and df2), + interleaving selected rows with standard errors and returning the final LaTeX string. + Simply print the return value to get reproducible LaTeX code. + + Parameters + ---------- + df1 : pd.DataFrame + A DataFrame containing the main results for each portfolio. + The index should represent portfolio names. + Columns (in order) must be: + 0: P1 (low beta) + 1: P2 + 2: P3 + 3: P4 + 4: P5 + 5: P6 + 6: P7 + 7: P8 + 8: P9 + 9: P10 (high beta) + 10: BAB + + df2 : pd.DataFrame + A DataFrame containing standard errors corresponding to selected columns from df1. + The index must exactly match df1. 
+ The columns (in order) are: + 0: Std of P1 (low beta) + 1: Std of P2 + 2: Std of P3 + 3: Std of P4 + 4: Std of P5 + 5: Std of P6 + 6: Std of P7 + 7: Std of P8 + 8: Std of P9 + 9: Std of P10 (high beta) + 10: Std of BAB + + Returns + ------- + str + Full LaTeX codes as a single string + ''' + # index = ['Excess return', 'CAPM alpha', 'Three-factor alpha', 'Four-factor alpha', 'Five-factor alpha', 'Beta (ex ante)', + # 'Beta (realized)', 'Volatility', 'Sharpe ratio'] + + percent_cols = [[], []] + bracket_cols = [[], list(range(df2.shape[1]))] + mean_std_pairs = [[i, i] for i in range(df2.shape[1])] + + formatted_df1 = _format_df_cols(df1, percent_cols[0], bracket_cols[0]) + formatted_df2 = _format_df_cols(df2, percent_cols[1], bracket_cols[1]) + df3 = _interleave_dfs( + formatted_df1, + formatted_df2, + correspondence = mean_std_pairs, + interleave_rows = list(range(df2.shape[0])) + ) + + df3_latex_code = df3.to_latex(header = False, index = False, escape = True).replace('NaN', '') + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in df3_latex_code.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule']) + ] + + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{tabular}{*{12}c}', + r'\toprule', + r'Portfolio & P1 & P2 & P3 & P4 & P5 & P6 & P7 & P8 & P9 & P10 & BAB \\', + r'& (low beta) & & & & & & & & & (high beta) & \\', + ] + df_latex_code + [ + r'\end{table}' + ] + latex_code = '\n'.join(latex_code) + + return latex_code + +# TODO: Complete the comments about df and return +# TODO: find a more proper name standing for its function +def fama_macbeth_latex( + df1: pd.DataFrame, + df2: pd.DataFrame, + df3: pd.DataFrame, + df4: pd.DataFrame +) -> str: + ''' + Generate a LaTeX-formatted side-by-side table for Fama-MacBeth regression results. 
+ + This function formats and interleaves two pairs of DataFrames, then generates LaTeX + code to display them as two tables placed side by side, each with its paired rows stacked vertically. + + Parameters + ---------- + df1 : pd.DataFrame + The index should hold the variable names (e.g., 'CGO', 'FROXY', 'PROXY x CGO'). + Columns (in order) must be: + 0: Variable (1) + 1: Variable (2) + 2: Variable (3) + 3: Variable (4) + + df2 : pd.DataFrame + The index must exactly match df1. + The columns (in order) are: + 0: Variable (1) + 1: Variable (2) + 2: Variable (3) + 3: Variable (4) + + df3 : pd.DataFrame + The index should hold the variable names (e.g., 'CGO', 'FROXY', 'PROXY x CGO'). + Columns (in order) must be: + 0: Variable (1) + 1: Variable (2) + 2: Variable (3) + 3: Variable (4) + + df4 : pd.DataFrame + The index must exactly match df3. + The columns (in order) are: + 0: Variable (1) + 1: Variable (2) + 2: Variable (3) + 3: Variable (4) + + Returns + ------- + str + A string of LaTeX code representing two side-by-side tables + ''' + # index = ['CGO', 'FROXY', 'PROXY x CGO', '\\multirow{2}{*}{\\shortstack[l]{PROXY x \\\\ MOM(-12, -1)}}', + # 'MOM(-1, 0)', 'MOM(-12, -1)', 'TURNOVER'] + percent_cols = [[], []] + bracket_cols = [[], list(range(df2.shape[1]))] + mean_std_pairs = [[i, i] for i in range(df2.shape[1])] + + latex_list = [] + df_dict = {1: df1, 2: df2, 3: df3, 4: df4} + for i in [1, 3]: + formatted1 = _format_df_cols(df_dict[i], percent_cols[0], bracket_cols[0]) + formatted2 = _format_df_cols(df_dict[i + 1], percent_cols[1], bracket_cols[1]) + df_latex_code = _interleave_dfs( + formatted1, + formatted2, + correspondence = mean_std_pairs + ).to_latex(header = False, index = False) + df_latex_code = re.sub('\(?nan\)?', '', df_latex_code, flags = re.IGNORECASE) + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in df_latex_code.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule']) + ] + latex_list.append(df_latex_code) 
+ + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{minipage}{0.48\textwidth}', + r'\centering', + r'\begin{tabular}{l*{4}c}', + r'\toprule', + r'\toprule', + r'Variable & (1) & (2) & (3) & (4) \\', + ] + latex_list[0] + [ + r'\end{minipage}', + r'\hfill', + r'\begin{minipage}{0.48\textwidth}', + r'\centering', + r'\begin{tabular}{l*{4}c}', + r'\toprule', + r'\toprule', + ] + latex_list[1] + [ + r'\end{minipage}', + r'\end{table}' + ] + latex_code = '\n'.join(latex_code) + + return latex_code + +# TODO: Complete the comments about df and return +# TODO: find a more proper name standing for its function +def regression_latex( + df1: pd.DataFrame, + df2: pd.DataFrame, + df3: pd.DataFrame, + df4: pd.DataFrame +) -> str: + ''' + Generate a LaTeX-formatted table of interleaved regression results. + + This function takes two pairs of regression outputs, interleaves them + row-wise, and formats them into a LaTeX tabular structure with predefined panel headings. + + Parameters + ---------- + df1 : pd.DataFrame + Columns (in order) must be: + 0 and 1: Global equities + 2 and 3: F1 10Y global + 4 and 5: F1 10Y-2Y global + 6 and 7: US Treasuries + 8 and 9: Commodities + + df2 : pd.DataFrame + The index must exactly match df1. + The columns (in order) are: + 0 and 1: Global equities + 2 and 3: F1 10Y global + 4 and 5: F1 10Y-2Y global + 6 and 7: US Treasuries + 8 and 9: Commodities + + df3 : pd.DataFrame + Columns (in order) must be: + 0 and 1: Currencies + 2 and 3: Credits + 4 and 5: Call options + 6 and 7: Put options + 8 and 9: GCF + + df4 : pd.DataFrame + The index must exactly match df3. 
+ The columns (in order) are: + 0 and 1: Currencies + 2 and 3: Credits + 4 and 5: Call options + 6 and 7: Put options + 8 and 9: GCF + + Returns + ------- + str + A string of LaTeX code representing a formatted table + ''' + # index = ['$\\alpha$', 'Passive long', 'Value', 'Momentum', 'TSMOM', '$R^2$', 'IR'] + percent_cols = [[], []] + bracket_cols = [[], list(range(df2.shape[1]))] + mean_std_pairs = [[i, i] for i in range(df2.shape[1])] + + latex_list = [] + df_dict = {1: df1, 2: df2, 3: df3, 4: df4} + for i in [1, 3]: + formatted1 = _format_df_cols(df_dict[i], percent_cols[0], bracket_cols[0]) + formatted2 = _format_df_cols(df_dict[i + 1], percent_cols[1], bracket_cols[1]) + df_latex_code = _interleave_dfs( + formatted1, + formatted2, + correspondence = mean_std_pairs, + interleave_rows = list(range(df2.shape[0])) + ).to_latex(header = False, index = False) + df_latex_code = re.sub('\(?nan\)?', '', df_latex_code, flags = re.IGNORECASE) + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in df_latex_code.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule', '\\end{tabular}']) + ] + latex_list.append(df_latex_code) + + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{tabular}{*{11}c}', + r'\toprule', + r'& \multicolumn{2}{c}{Global equities} & \multicolumn{2}{c}{F1 10Y global} &' + \ + r'\multicolumn{2}{c}{F1 10Y-2Y global} & \multicolumn{2}{c}{US Treasuries} &' + \ + r'\multicolumn{2}{c}{Commodities} \\' + ] + latex_list[0] + [ + r'& \multicolumn{2}{c}{Currencies} & \multicolumn{2}{c}{Credits} &' + \ + r'\multicolumn{2}{c}{Call options} & \multicolumn{2}{c}{Put options} &' + \ + r'\multicolumn{2}{c}{GCF} \\' + ] + latex_list[1] + [ + r'\end{tabular}', + r'\end{table}' + ] + latex_code = '\n'.join(latex_code) + + return latex_code + +# TODO: Complete the comments about df and return +# TODO: find a more proper name standing for its function +def else1_latex(df1: pd.DataFrame, 
df2: pd.DataFrame) -> str: + ''' + Generate LaTeX code for a two-panel table. + + This function takes two DataFrames: one for baseline models and one for FS-FMB procedure. + + Parameters + ---------- + df1 : pd.DataFrame + DataFrame representing Panel A: baseline model. + The index should represent serial number. + Columns (in order) must be: + 0: # standing for serial number + 1: Model + 2: Adj.R-squared + 3: alpha + 4: t-stat(alpha) + + df2 : pd.DataFrame + DataFrame representing Panel B: FS-FMB procedure. + The index should represent step number. + Columns (in order) must be: + 0: Step + 1: h_j + 2: Adj.R-squared + 3: alpha + 4: t-stat(alpha) + + Returns + ------- + str + A LaTeX string for rendering a two-panel regression summary table. + ''' + # Model = ['CAPM', 'FF3', 'FF5', 'FF5M'] + # h_j = ['SMB2', 'SMB2*Mom', 'Mom2*RMW', 'Mkt-RF2','Mkt-RF2*RMW', 'Mkt-Rf*SMB', 'HML2*Mkt-RF'] + latex_list = [] + df_dict = {1: df1, 2: df2} + for i in [1, 2]: + df_latex_code = df_dict[i].to_latex(escape = True, header = False, float_format = '%.3f') + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in df_latex_code.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule', '\\end{tabular}']) + ] + latex_list.append(df_latex_code) + + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{tabular}{clccc}', + r'\toprule', + r'\multicolumn{5}{c}{Panel A: Baseline Models} \\', + r'\midrule', + r'\# & Model & Adj.R-squared & $\alpha$ & t-stat($\alpha$) \\' + ] + latex_list[0] + [ + r'\multicolumn{5}{c}{Panel B: FS-FMB procedure} \\', + r'\midrule', + r'Step & $h_j$ & Adj.R-squared & $\alpha$ & t-stat($\alpha$) \\', + ] + latex_list[1] + [ + r'\end{tabular}', + r'\end{table}' + ] + latex_code = '\n'.join(latex_code) + + return latex_code + +# TODO: Complete the comments about df and return +# TODO: find a more proper name standing for its function +def else2_latex(df: pd.DataFrame) -> str: + ''' + 
Generate LaTeX code for a regression R-squared comparison table. + + Parameters + ---------- + df : pd.DataFrame + The index should represent various R-square. + Columns (in order) must be: + 0: CAPM + 1: FF3 + 2: FF5 + 3: FF5M + 4: Higher-Order + + Returns + ------- + str + A LaTeX-formatted string representing a two-row table for model R² comparison. + ''' + # ['$R_{train}^2$', '$R_{oos}^2$'] + df_latex_code = df.to_latex(header = False, float_format = '%.3f') + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in df_latex_code.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule']) + ] + + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{tabular}{*{6}c}', + r'\toprule', + r'& (1) & (2) & (3) & (4) & (5) \\', + r'& CAPM & FF3 & FF5 & FF5M & Higher-Order\\' + ] + df_latex_code + [ + r'\end{table}' + ] + latex_code = '\n'.join(latex_code) + + return latex_code + +# TODO: Complete the comments about df and return +# TODO: find a more proper name standing for its function +def else3_latex( + df: pd.DataFrame, + val1: float, + val2: float + ) -> str: + ''' + Generate LaTeX code for a table showing factors. + + Parameters + ---------- + df : pd.DataFrame + A DataFrame with index representing factor names. + The index should represent factor. + Columns (in order) must be: + 0: Frac Sig 5% (1) + 1: Frac Sig 5% (2) + + val1 : float + + val2 : float + + Returns + ------- + str + A LaTeX-formatted table string. 
+ ''' + # index = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom', 'SMB2', 'SMB2*Mom', + # 'Mom2*RMW', 'Mkt-RF2', 'Mkt-RF2*RMW', 'Mkt-RF*SMB', 'HML*2Mkt-RF'] + df_latex_code = df.to_latex(header = False, float_format = '%.3f') + df_latex_code = re.sub('\(?nan\)?', '', df_latex_code, flags = re.IGNORECASE) + # ['\\begin{tabular}', 'toprule'] have been typeset so deleted + df_latex_code = [ + line for line in df_latex_code.splitlines() + if not any(i in line for i in ['\\begin{tabular}', 'toprule', '\\end{tabular}']) + ] + + latex_code = [ + r'\begin{table}[ht]', + r'\centering', + r'\begin{tabular}{lcc}', + r'\toprule', + r'& (1) & (2) \\', + r'\midrule', + r'Factor & Frac Sig 5\% & Frac Sig 5\% \\' + ] + df_latex_code + [ + f'\\# zoo factors & {val1} & {val2} \\\\', + r'\bottomrule', + r'\end{tabular}', + r'\end{table}', + ] + latex_code = '\n'.join(latex_code) + + return latex_code + def factor_to_quantile(factor: pd.DataFrame, quantiles: int = 5) -> pd.DataFrame: """ Convert factor to quantile row-wise. The result will always have quantile values ranging from `quantiles` down @@ -325,17 +1052,92 @@ def _row_to_quantile(row_p, row_s): return result -def _compute_quantile_df(qt: pd.DataFrame, fr: pd.DataFrame, reindex=True, quantiles: int = 5): +def _compute_quantile_df( + qt: pd.DataFrame, + fr: pd.DataFrame, + reindex = True, + quantiles: int = 5 +) -> pd.DataFrame: + ''' + Compute equal-weighted average forward returns for each quantile group. + + Assumes that `qt` (quantile assignments) and `fr` (forward returns) are aligned + by index and columns — i.e., same dates (index) and same stocks (columns). + + Parameters + ---------- + qt : pd.DataFrame + Quantile assignment for each asset at each time. + Index: time, Columns: stock code, Values: quantile group (int from 1 to `quantiles`) + + fr : pd.DataFrame + Forward returns for each asset at each time. 
+ Index: time, Columns: stock code, Values: future return + + reindex : bool, default True + Whether to ensure the result has columns 1 to `quantiles` (even if some are missing at certain times) + + quantiles : int, default 5 + Number of quantile groups + + Returns + ------- + pd.DataFrame + A time-series DataFrame of average returns for each quantile group. + Index: time, Columns: quantile group (1 ~ `quantiles`) + ''' # assume aligned result = {} for (dt, fr_row), (_, qt_row) in zip(fr.iterrows(), qt.iterrows()): result[dt] = fr_row.groupby(qt_row).mean() result = pd.DataFrame(result).T if reindex: - return result.reindex(columns=np.arange(1, quantiles + 1), copy=False) + return result.reindex(columns = np.arange(1, quantiles + 1), copy = False) return result -def _compute_weighted_quantile_df(qt: pd.DataFrame, fr: pd.DataFrame, wt: pd.DataFrame, reindex= True, quantiles: int = 5): +def _compute_weighted_quantile_df( + qt: pd.DataFrame, + fr: pd.DataFrame, + wt: pd.DataFrame, + reindex = True, + quantiles: int = 5 +) -> pd.DataFrame: + ''' + Compute value-weighted average forward returns for each quantile group. + + This is the weighted version of `_compute_quantile_df`, where the group-wise + mean is computed using market capitalization. + + Assumes that `qt`, `fr`, and `wt` are aligned by index and columns: + i.e., same dates (index) and same stocks (columns). + + Parameters + ---------- + qt : pd.DataFrame + Quantile assignment for each asset at each time. + Index: time, Columns: stock code, Values: quantile group (int from 1 to `quantiles`) + + fr : pd.DataFrame + Forward returns for each asset at each time. + Index: time, Columns: stock code, Values: forward return + + wt : pd.DataFrame + Value weights for each asset at each time (e.g., market capitalization). 
+ Index: time, Columns: stock code, Values: weight + + reindex : bool, default True + Whether to ensure the result has columns 1 to `quantiles` + (even if some quantile groups are missing at some timestamps) + + quantiles : int, default 5 + Number of quantile groups + + Returns + ------- + pd.DataFrame + A time-series DataFrame of value-weighted average returns for each quantile group. + Index: time, Columns: quantile group (1 ~ `quantiles`) + ''' # assume aligned result = {} for (dt, fr_row), (_, qt_row), (_, wt_row) in zip(fr.iterrows(), qt.iterrows(), wt.iterrows()): diff --git a/tests/test_latex.ipynb b/tests/test_latex.ipynb new file mode 100644 index 0000000..a2926d5 --- /dev/null +++ b/tests/test_latex.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "14fac73a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from firefin.evaluation.eva_utils import (\n", + " single_sort_table1_latex, \n", + " single_sort_table2_latex, \n", + " single_sort_table3_latex,\n", + " fama_macbeth_latex,\n", + " regression_latex,\n", + " else1_latex,\n", + " else2_latex,\n", + " else3_latex\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23a62102", + "metadata": {}, + "outputs": [], + "source": [ + "index = ['1A', '1B', '1C', '1 (high)'] + list(range(2, 10)) + \\\n", + " ['10 (low)', '10A', '10B', '10C', '1-10 spread', '1A-1C spread', '9-10 spread']\n", + "df1 = pd.DataFrame(np.random.random((19, 11)), index = index)\n", + "df2 = pd.DataFrame(np.random.random((19, 7)), index = index)\n", + "print(single_sort_table1_latex(df1, df2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff050b05", + "metadata": {}, + "outputs": [], + "source": [ + "index = ['1 (high)'] + list(range(2, 10)) + ['10 (low)', '1-10 spread', '9-10 spread']\n", + "df = pd.DataFrame(np.random.random((12, 12)), index = index)\n", + 
"print(single_sort_table2_latex(df))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67f94a37", + "metadata": {}, + "outputs": [], + "source": [ + "index = ['Excess return', 'CAPM alpha', 'Three-factor alpha', 'Four-factor alpha', 'Five-factor alpha', 'Beta (ex ante)', \n", + " 'Beta (realized)', 'Volatility', 'Sharpe ratio']\n", + "df1 = pd.DataFrame(np.random.random((9, 11)), index = index)\n", + "df2 = pd.DataFrame(np.random.random((5, 11)), index = index[:5])\n", + "print(single_sort_table3_latex(df1, df2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edd31db3", + "metadata": {}, + "outputs": [], + "source": [ + "index = ['CGO', 'FROXY', 'PROXY x CGO', '\\\\multirow{2}{*}{\\\\shortstack[l]{PROXY x \\\\\\\\ MOM(-12, -1)}}', 'MOM(-1, 0)', 'MOM(-12, -1)', 'TURNOVER']\n", + "df1 = pd.DataFrame(np.random.random((7, 4)), index = index)\n", + "df2 = pd.DataFrame(np.random.random((7, 4)), index = index)\n", + "df3 = pd.DataFrame(np.random.random((7, 4)), index = index)\n", + "df4 = pd.DataFrame(np.random.random((7, 4)), index = index)\n", + "for i in range(4):\n", + " for j in range(i):\n", + " df1.iloc[i, j] = np.nan\n", + " df2.iloc[i, j] = np.nan\n", + " df3.iloc[i, j] = np.nan\n", + " df4.iloc[i, j] = np.nan\n", + "print(fama_macbeth_latex(df1, df2, df3, df4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f29ddab", + "metadata": {}, + "outputs": [], + "source": [ + "index = ['$\\\\alpha$', 'Passive long', 'Value', 'Momentum', 'TSMOM', '$R^2$', 'IR']\n", + "df1 = pd.DataFrame(np.random.random((7, 10)), index = index)\n", + "df2 = pd.DataFrame(np.random.random((5, 10)), index = index[:-2])\n", + "df3 = pd.DataFrame(np.random.random((7, 10)), index = index)\n", + "df4 = pd.DataFrame(np.random.random((5, 10)), index = index[:-2])\n", + "for i in [2, 3, 4]:\n", + " for j in [0, 2, 4, 6, 8]:\n", + " df1.iloc[i, j] = np.nan\n", + " df2.iloc[i, j] = np.nan\n", + " df3.iloc[i, j] = np.nan\n", + 
" df4.iloc[i, j] = np.nan\n", + "print(regression_latex(df1, df2, df3, df4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ddf4320", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = pd.DataFrame(np.random.random((4, 3)))\n", + "df1.insert(0, 'Model', ['CAPM', 'FF3', 'FF5', 'FF5M'])\n", + "df2 = pd.DataFrame(np.random.random((7, 3)))\n", + "df2.insert(0, 'h_j', ['SMB2', 'SMB2*Mom', 'Mom2*RMW', 'Mkt-RF2','Mkt-RF2*RMW', 'Mkt-Rf*SMB', 'HML2*Mkt-RF'])\n", + "print(else1_latex(df1, df2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65238b50", + "metadata": {}, + "outputs": [], + "source": [ + "index = ['$R_{train}^2$', '$R_{oos}^2$']\n", + "df = pd.DataFrame(np.random.random((2, 5)), index = index)\n", + "print(else2_latex(df))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3485318", + "metadata": {}, + "outputs": [], + "source": [ + "index = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom', 'SMB2', 'SMB2*Mom', 'Mom2*RMW', 'Mkt-RF2', 'Mkt-RF2*RMW', 'Mkt-RF*SMB', 'HML*2Mkt-RF']\n", + "df = pd.DataFrame([\n", + " np.random.random(6),\n", + " np.random.random(13)\n", + "], columns = index).T\n", + "print(else3_latex(df))" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}