pycorrana.core.cca 源代码

"""
典型相关分析模块 (Canonical Correlation Analysis, CCA)
=====================================================
提供两组变量之间的典型相关分析功能。

典型相关分析是一种多元统计方法,用于研究两组变量之间的线性关系。
它寻找两组变量的线性组合,使得这些组合之间的相关性最大化。
"""

import warnings
from typing import Optional, List, Union, Tuple, Dict, TYPE_CHECKING
import numpy as np
import pandas as pd
from scipy import stats
from scipy.linalg import inv, eig, sqrtm

if TYPE_CHECKING:
    from ..utils.large_data import LargeDataConfig


def _validate_cca_inputs(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarray, int]:
    """
    验证并准备CCA输入数据。
    
    Parameters
    ----------
    X, Y : np.ndarray
        输入数据矩阵
        
    Returns
    -------
    tuple
        (处理后的X, 处理后的Y, 样本量)
    """
    X = np.asarray(X, dtype=np.float64)
    Y = np.asarray(Y, dtype=np.float64)
    
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    if Y.ndim == 1:
        Y = Y.reshape(-1, 1)
    
    if X.shape[0] != Y.shape[0]:
        raise ValueError(f"X和Y的样本量不一致: X有{X.shape[0]}行, Y有{Y.shape[0]}行")
    
    mask = ~(np.isnan(X).any(axis=1) | np.isnan(Y).any(axis=1))
    X = X[mask]
    Y = Y[mask]
    
    n = X.shape[0]
    
    if n < max(X.shape[1], Y.shape[1]) + 1:
        raise ValueError(f"样本量不足: 需要至少{max(X.shape[1], Y.shape[1]) + 1}个观测")
    
    return X, Y, n


def _compute_cca_eigen(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    计算CCA的特征值和特征向量。
    
    使用标准的CCA算法:
    1. 计算协方差矩阵
    2. 求解广义特征值问题
    3. 得到典型相关系数和典型变量系数
    
    Parameters
    ----------
    X, Y : np.ndarray
        中心化后的数据矩阵
        
    Returns
    -------
    tuple
        (典型相关系数, X的典型变量系数, Y的典型变量系数, 特征值)
    """
    n = X.shape[0]
    p = X.shape[1]
    q = Y.shape[1]
    
    Sxx = np.cov(X, rowvar=False)
    Syy = np.cov(Y, rowvar=False)
    Sxy = np.cov(X, Y, rowvar=False)[:p, p:]
    Syx = Sxy.T
    
    Sxx_inv = inv(Sxx)
    Syy_inv = inv(Syy)
    
    M1 = Sxx_inv @ Sxy @ Syy_inv @ Syx
    M2 = Syy_inv @ Syx @ Sxx_inv @ Sxy
    
    eigenvalues1, eigenvectors1 = eig(M1)
    eigenvalues2, eigenvectors2 = eig(M2)
    
    idx1 = np.argsort(eigenvalues1.real)[::-1]
    eigenvalues1 = eigenvalues1.real[idx1]
    eigenvectors1 = eigenvectors1.real[:, idx1]
    
    idx2 = np.argsort(eigenvalues2.real)[::-1]
    eigenvalues2 = eigenvalues2.real[idx2]
    eigenvectors2 = eigenvectors2.real[:, idx2]
    
    eigenvalues = eigenvalues1
    canonical_corrs = np.sqrt(np.clip(eigenvalues, 0, 1))
    
    k = min(p, q)
    canonical_corrs = canonical_corrs[:k]
    
    A = eigenvectors1[:, :k]
    B = eigenvectors2[:, :k]
    
    for i in range(k):
        norm_a = np.sqrt(A[:, i].T @ Sxx @ A[:, i])
        if norm_a > 0:
            A[:, i] = A[:, i] / norm_a
        
        norm_b = np.sqrt(B[:, i].T @ Syy @ B[:, i])
        if norm_b > 0:
            B[:, i] = B[:, i] / norm_b
    
    return canonical_corrs, A, B, eigenvalues[:k]


def _wilks_lambda_test(canonical_corrs: np.ndarray, n: int, p: int, q: int) -> List[Dict]:
    """
    使用Wilks' Lambda进行典型相关显著性检验。
    
    Wilks' Lambda检验用于检验典型相关系数是否显著不为零。
    
    Parameters
    ----------
    canonical_corrs : np.ndarray
        典型相关系数
    n : int
        样本量
    p : int
        X的变量数
    q : int
        Y的变量数
        
    Returns
    -------
    list
        每个典型相关系数的检验结果
    """
    k = len(canonical_corrs)
    results = []
    
    for i in range(k):
        remaining_corrs = canonical_corrs[i:]
        remaining_eigenvalues = remaining_corrs ** 2
        
        wilks_lambda = 1.0
        for ev in remaining_eigenvalues:
            wilks_lambda *= (1 - ev)
        
        df1 = (p - i) * (q - i)
        df2 = (n - i - 1.5) * (p + q - 2 * i) - 0.5 * ((p - i) * (q - i) - 1)
        
        if df2 > 0 and wilks_lambda > 0:
            chi_sq = -(n - i - 0.5 * (p + q + 3)) * np.log(wilks_lambda)
            p_value = 1 - stats.chi2.cdf(chi_sq, df1)
        else:
            chi_sq = np.nan
            p_value = np.nan
        
        results.append({
            'canonical_index': i + 1,
            'canonical_correlation': canonical_corrs[i],
            'wilks_lambda': wilks_lambda,
            'chi_square': chi_sq,
            'df': df1,
            'p_value': p_value
        })
    
    return results


def _compute_confidence_intervals(canonical_corrs: np.ndarray, n: int, 
                                   confidence_level: float = 0.95) -> List[Tuple[float, float]]:
    """
    计算典型相关系数的置信区间。
    
    使用Fisher Z变换计算置信区间。
    
    Parameters
    ----------
    canonical_corrs : np.ndarray
        典型相关系数
    n : int
        样本量
    confidence_level : float
        置信水平
        
    Returns
    -------
    list
        每个典型相关系数的置信区间
    """
    alpha = 1 - confidence_level
    z_crit = stats.norm.ppf(1 - alpha / 2)
    
    intervals = []
    for r in canonical_corrs:
        if r >= 1.0:
            intervals.append((1.0, 1.0))
        elif r <= 0.0:
            intervals.append((0.0, 0.0))
        else:
            z = np.arctanh(r)
            se = 1 / np.sqrt(n - 3)
            
            z_lower = z - z_crit * se
            z_upper = z + z_crit * se
            
            r_lower = np.tanh(z_lower)
            r_upper = np.tanh(z_upper)
            
            r_lower = max(0.0, min(1.0, r_lower))
            r_upper = max(0.0, min(1.0, r_upper))
            
            intervals.append((r_lower, r_upper))
    
    return intervals


def _compute_redundancy(X: np.ndarray, Y: np.ndarray, 
                        A: np.ndarray, B: np.ndarray,
                        canonical_corrs: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    计算冗余指数(Redundancy Index)。
    
    冗余指数衡量一组变量的方差能被另一组变量的典型变量解释的比例。
    
    Parameters
    ----------
    X, Y : np.ndarray
        原始数据矩阵
    A, B : np.ndarray
        典型变量系数
    canonical_corrs : np.ndarray
        典型相关系数
        
    Returns
    -------
    tuple
        (X被Y解释的冗余, Y被X解释的冗余, X的方差解释比例, Y的方差解释比例)
    """
    k = len(canonical_corrs)
    
    X_scores = X @ A
    Y_scores = Y @ B
    
    var_X_scores = np.var(X_scores, axis=0)
    var_Y_scores = np.var(Y_scores, axis=0)
    
    total_var_X = np.var(X, axis=0).sum()
    total_var_Y = np.var(Y, axis=0).sum()
    
    if total_var_X > 0:
        X_var_explained = var_X_scores / total_var_X
    else:
        X_var_explained = np.zeros(k)
    
    if total_var_Y > 0:
        Y_var_explained = var_Y_scores / total_var_Y
    else:
        Y_var_explained = np.zeros(k)
    
    redundancy_X_given_Y = X_var_explained * (canonical_corrs ** 2)
    redundancy_Y_given_X = Y_var_explained * (canonical_corrs ** 2)
    
    return redundancy_X_given_Y, redundancy_Y_given_X, X_var_explained, Y_var_explained


[文档] def cca(X: Union[np.ndarray, pd.DataFrame], Y: Union[np.ndarray, pd.DataFrame], compute_significance: bool = True, confidence_level: float = 0.95, n_permutations: int = 1000, verbose: bool = False) -> Dict: """ 执行典型相关分析(Canonical Correlation Analysis, CCA)。 典型相关分析用于研究两组变量之间的线性关系。它寻找两组变量的 线性组合(典型变量),使得这些组合之间的相关性最大化。 Parameters ---------- X : array-like or pd.DataFrame 第一组变量,形状为 (n_samples, p) Y : array-like or pd.DataFrame 第二组变量,形状为 (n_samples, q) compute_significance : bool, default=True 是否计算显著性检验(p值) confidence_level : float, default=0.95 置信区间水平 n_permutations : int, default=1000 置换检验次数(用于额外的显著性验证) verbose : bool, default=False 是否输出详细信息 Returns ------- dict 包含以下键的字典: - canonical_correlations: 典型相关系数数组 - x_weights: X的典型变量系数(权重矩阵) - y_weights: Y的典型变量系数(权重矩阵) - x_scores: X的典型变量得分 - y_scores: Y的典型变量得分 - significance_tests: 显著性检验结果列表 - confidence_intervals: 置信区间列表 - redundancy: 冗余分析结果 - n_components: 典型变量对数 - n_samples: 样本量 - x_names: X的变量名(如果输入是DataFrame) - y_names: Y的变量名(如果输入是DataFrame) Examples -------- >>> import numpy as np >>> X = np.random.randn(100, 3) >>> Y = np.random.randn(100, 2) >>> result = cca(X, Y) >>> print(f"第一对典型相关系数: {result['canonical_correlations'][0]:.4f}") >>> # 使用DataFrame >>> import pandas as pd >>> df = pd.DataFrame(np.random.randn(100, 5), ... columns=['x1', 'x2', 'x3', 'y1', 'y2']) >>> result = cca(df[['x1', 'x2', 'x3']], df[['y1', 'y2']]) References ---------- Hotelling, H. (1936). "Relations between two sets of variates" Biometrika, 28(3/4), 321-377 """ x_names = None y_names = None if isinstance(X, pd.DataFrame): x_names = X.columns.tolist() X = X.values if isinstance(Y, pd.DataFrame): y_names = Y.columns.tolist() Y = Y.values X, Y, n = _validate_cca_inputs(X, Y) p = X.shape[1] q = Y.shape[1] k = min(p, q) if verbose: print(f"CCA分析: X有{p}个变量, Y有{q}个变量, 样本量={n}") print(f"将计算{k}对典型变量") X_centered = X - X.mean(axis=0) Y_centered = Y - Y.mean(axis=0) canonical_corrs, A, B, eigenvalues = _compute_cca_eigen(X_centered, Y_centered) X_scores = X_centered @ A Y_scores = Y_centered @ B significance_tests = [] if compute_significance: significance_tests = _wilks_lambda_test(canonical_corrs, n, p, q) confidence_intervals = _compute_confidence_intervals(canonical_corrs, n, confidence_level) redundancy_X_given_Y, redundancy_Y_given_X, X_var_explained, Y_var_explained = \ _compute_redundancy(X_centered, Y_centered, A, B, canonical_corrs) result = { 'canonical_correlations': canonical_corrs, 'x_weights': A, 'y_weights': B, 'x_scores': X_scores, 'y_scores': Y_scores, 'eigenvalues': eigenvalues, 'significance_tests': significance_tests, 'confidence_intervals': confidence_intervals, 'redundancy': { 'x_given_y': redundancy_X_given_Y, 'y_given_x': redundancy_Y_given_X, 'x_variance_explained': X_var_explained, 'y_variance_explained': Y_var_explained, 'total_x_given_y': redundancy_X_given_Y.sum(), 'total_y_given_x': redundancy_Y_given_X.sum() }, 'n_components': k, 'n_samples': n, 'n_x_vars': p, 'n_y_vars': q, 'x_names': x_names if x_names else [f'X{i+1}' for i in range(p)], 'y_names': y_names if y_names else [f'Y{i+1}' for i in range(q)], 'x_mean': X.mean(axis=0), 'y_mean': Y.mean(axis=0) } if verbose: print("\n典型相关系数:") for i, (r, ci) in enumerate(zip(canonical_corrs, confidence_intervals)): print(f" 第{i+1}对: {r:.4f} (95% CI: [{ci[0]:.4f}, {ci[1]:.4f}])") if significance_tests: print("\n显著性检验 (Wilks' Lambda):") for test in significance_tests: print(f" 第{test['canonical_index']}对: χ²={test['chi_square']:.4f}, " f"df={test['df']}, p={test['p_value']:.4f}") print(f"\n冗余分析:") print(f" X被Y解释的总方差: {result['redundancy']['total_x_given_y']:.2%}") print(f" Y被X解释的总方差: {result['redundancy']['total_y_given_x']:.2%}") return result
[文档] def cca_permutation_test(X: Union[np.ndarray, pd.DataFrame], Y: Union[np.ndarray, pd.DataFrame], n_permutations: int = 1000, random_state: Optional[int] = None) -> Dict: """ 使用置换检验验证CCA结果的显著性。 置换检验通过随机打乱样本标签来生成零假设分布, 然后比较观测到的典型相关系数与零假设分布。 Parameters ---------- X : array-like or pd.DataFrame 第一组变量 Y : array-like or pd.DataFrame 第二组变量 n_permutations : int, default=1000 置换次数 random_state : int, optional 随机种子 Returns ------- dict 包含置换检验结果的字典 Examples -------- >>> X = np.random.randn(100, 3) >>> Y = np.random.randn(100, 2) >>> result = cca_permutation_test(X, Y, n_permutations=500) >>> print(f"第一对典型相关系数p值: {result['p_values'][0]:.4f}") """ if random_state is not None: np.random.seed(random_state) if isinstance(X, pd.DataFrame): X = X.values if isinstance(Y, pd.DataFrame): Y = Y.values X, Y, n = _validate_cca_inputs(X, Y) X_centered = X - X.mean(axis=0) Y_centered = Y - Y.mean(axis=0) obs_result = cca(X, Y, compute_significance=False, verbose=False) obs_corrs = obs_result['canonical_correlations'] k = len(obs_corrs) perm_corrs = np.zeros((n_permutations, k)) for i in range(n_permutations): perm_idx = np.random.permutation(n) Y_perm = Y_centered[perm_idx] try: perm_result = cca(X_centered, Y_perm, compute_significance=False, verbose=False) perm_corrs[i, :len(perm_result['canonical_correlations'])] = perm_result['canonical_correlations'] except Exception: pass p_values = np.zeros(k) for j in range(k): p_values[j] = (np.sum(perm_corrs[:, j] >= obs_corrs[j]) + 1) / (n_permutations + 1) return { 'observed_correlations': obs_corrs, 'p_values': p_values, 'n_permutations': n_permutations, 'permutation_distribution': perm_corrs }
[文档] class CCAAnalyzer: """ 典型相关分析器类 提供完整的典型相关分析流程,包括分析、可视化和结果解释。 Parameters ---------- X : pd.DataFrame 第一组变量 Y : pd.DataFrame 第二组变量 verbose : bool, default=True 是否输出详细信息 Examples -------- >>> analyzer = CCAAnalyzer(df[['x1', 'x2']], df[['y1', 'y2']]) >>> result = analyzer.fit() >>> analyzer.summary() >>> analyzer.plot_weights() """
[文档] def __init__(self, X: Union[pd.DataFrame, np.ndarray], Y: Union[pd.DataFrame, np.ndarray], verbose: bool = True): if isinstance(X, np.ndarray): X = pd.DataFrame(X, columns=[f'X{i+1}' for i in range(X.shape[1])]) if isinstance(Y, np.ndarray): Y = pd.DataFrame(Y, columns=[f'Y{i+1}' for i in range(Y.shape[1])]) self.X = X self.Y = Y self.verbose = verbose self.result = None
[文档] def fit(self, compute_significance: bool = True, confidence_level: float = 0.95, n_permutations: int = 1000) -> Dict: """ 执行典型相关分析。 Parameters ---------- compute_significance : bool, default=True 是否计算显著性检验 confidence_level : float, default=0.95 置信区间水平 n_permutations : int, default=1000 置换检验次数 Returns ------- dict 分析结果 """ self.result = cca( self.X, self.Y, compute_significance=compute_significance, confidence_level=confidence_level, n_permutations=n_permutations, verbose=self.verbose ) return self.result
[文档] def summary(self) -> str: """ 生成分析摘要。 Returns ------- str 摘要文本 """ if self.result is None: raise ValueError("请先调用fit()方法") lines = [] lines.append("=" * 60) lines.append("典型相关分析结果摘要") lines.append("=" * 60) lines.append(f"\n样本量: {self.result['n_samples']}") lines.append(f"X变量数: {self.result['n_x_vars']}") lines.append(f"Y变量数: {self.result['n_y_vars']}") lines.append(f"典型变量对数: {self.result['n_components']}") lines.append("\n典型相关系数:") lines.append("-" * 50) lines.append(f"{'序号':<6}{'相关系数':<12}{'95%置信区间':<20}{'p值':<10}") lines.append("-" * 50) for i, (r, ci) in enumerate(zip( self.result['canonical_correlations'], self.result['confidence_intervals'] )): p_val = "" if self.result['significance_tests']: p_val = f"{self.result['significance_tests'][i]['p_value']:.4f}" lines.append(f"{i+1:<6}{r:<12.4f}[{ci[0]:.4f}, {ci[1]:.4f}]{'':>4}{p_val:<10}") lines.append("\n冗余分析:") lines.append("-" * 50) lines.append(f"X被Y解释的总方差: {self.result['redundancy']['total_x_given_y']:.2%}") lines.append(f"Y被X解释的总方差: {self.result['redundancy']['total_y_given_x']:.2%}") lines.append("\n各典型变量的方差解释比例:") lines.append(" X变量:") for i, v in enumerate(self.result['redundancy']['x_variance_explained']): lines.append(f" 第{i+1}对: {v:.2%}") lines.append(" Y变量:") for i, v in enumerate(self.result['redundancy']['y_variance_explained']): lines.append(f" 第{i+1}对: {v:.2%}") return "\n".join(lines)
[文档] def get_weights(self, component: int = 1) -> pd.DataFrame: """ 获取指定典型变量的权重。 Parameters ---------- component : int 典型变量序号(从1开始) Returns ------- pd.DataFrame 权重数据框 """ if self.result is None: raise ValueError("请先调用fit()方法") idx = component - 1 if idx < 0 or idx >= self.result['n_components']: raise ValueError(f"无效的典型变量序号: {component}") weights_df = pd.DataFrame({ 'X变量': self.result['x_names'], f'X权重(第{component}对)': self.result['x_weights'][:, idx] }) y_weights_df = pd.DataFrame({ 'Y变量': self.result['y_names'], f'Y权重(第{component}对)': self.result['y_weights'][:, idx] }) return weights_df, y_weights_df
[文档] def get_scores(self) -> Tuple[pd.DataFrame, pd.DataFrame]: """ 获取典型变量得分。 Returns ------- tuple (X的典型变量得分, Y的典型变量得分) """ if self.result is None: raise ValueError("请先调用fit()方法") x_scores = pd.DataFrame( self.result['x_scores'], columns=[f'Canonical_{i+1}' for i in range(self.result['n_components'])] ) y_scores = pd.DataFrame( self.result['y_scores'], columns=[f'Canonical_{i+1}' for i in range(self.result['n_components'])] ) return x_scores, y_scores
[文档] def plot_weights(self, component: int = 1, figsize: Tuple[int, int] = (12, 5), savefig: Optional[str] = None): """ 绘制典型变量权重图。 Parameters ---------- component : int 典型变量序号 figsize : tuple 图表大小 savefig : str, optional 保存路径 """ if self.result is None: raise ValueError("请先调用fit()方法") import matplotlib.pyplot as plt idx = component - 1 if idx < 0 or idx >= self.result['n_components']: raise ValueError(f"无效的典型变量序号: {component}") x_weights = self.result['x_weights'][:, idx] y_weights = self.result['y_weights'][:, idx] x_names = self.result['x_names'] y_names = self.result['y_names'] fig, axes = plt.subplots(1, 2, figsize=figsize) ax1 = axes[0] colors1 = ['#3498db' if w >= 0 else '#e74c3c' for w in x_weights] ax1.barh(x_names, x_weights, color=colors1) ax1.set_xlabel('权重') ax1.set_title(f'X变量权重 (第{component}对典型变量)') ax1.axvline(x=0, color='gray', linestyle='--', alpha=0.5) ax2 = axes[1] colors2 = ['#2ecc71' if w >= 0 else '#e67e22' for w in y_weights] ax2.barh(y_names, y_weights, color=colors2) ax2.set_xlabel('权重') ax2.set_title(f'Y变量权重 (第{component}对典型变量)') ax2.axvline(x=0, color='gray', linestyle='--', alpha=0.5) plt.tight_layout() if savefig: plt.savefig(savefig, dpi=150, bbox_inches='tight') plt.show() return fig
[文档] def plot_scores(self, component: int = 1, figsize: Tuple[int, int] = (8, 8), savefig: Optional[str] = None): """ 绘制典型变量得分散点图。 Parameters ---------- component : int 典型变量序号 figsize : tuple 图表大小 savefig : str, optional 保存路径 """ if self.result is None: raise ValueError("请先调用fit()方法") import matplotlib.pyplot as plt idx = component - 1 if idx < 0 or idx >= self.result['n_components']: raise ValueError(f"无效的典型变量序号: {component}") x_scores = self.result['x_scores'][:, idx] y_scores = self.result['y_scores'][:, idx] r = self.result['canonical_correlations'][idx] fig, ax = plt.subplots(figsize=figsize) ax.scatter(x_scores, y_scores, alpha=0.6, edgecolors='white', linewidth=0.5) z = np.polyfit(x_scores, y_scores, 1) p = np.poly1d(z) x_line = np.linspace(x_scores.min(), x_scores.max(), 100) ax.plot(x_line, p(x_line), 'r--', alpha=0.8, label=f'相关系数 = {r:.4f}') ax.set_xlabel(f'X典型变量得分 (第{component}对)') ax.set_ylabel(f'Y典型变量得分 (第{component}对)') ax.set_title(f'典型变量得分散点图 (第{component}对)') ax.legend() ax.axhline(y=0, color='gray', linestyle='--', alpha=0.3) ax.axvline(x=0, color='gray', linestyle='--', alpha=0.3) plt.tight_layout() if savefig: plt.savefig(savefig, dpi=150, bbox_inches='tight') plt.show() return fig
[文档] def plot_correlations(self, figsize: Tuple[int, int] = (10, 8), savefig: Optional[str] = None): """ 绘制典型相关系数图。 Parameters ---------- figsize : tuple 图表大小 savefig : str, optional 保存路径 """ if self.result is None: raise ValueError("请先调用fit()方法") import matplotlib.pyplot as plt corrs = self.result['canonical_correlations'] cis = self.result['confidence_intervals'] n_components = len(corrs) fig, ax = plt.subplots(figsize=figsize) x_pos = range(1, n_components + 1) lower_errors = [corrs[i] - cis[i][0] for i in range(n_components)] upper_errors = [cis[i][1] - corrs[i] for i in range(n_components)] ax.bar(x_pos, corrs, color='#3498db', alpha=0.8, yerr=[lower_errors, upper_errors], capsize=5) ax.set_xlabel('典型变量对') ax.set_ylabel('典型相关系数') ax.set_title('典型相关系数及95%置信区间') ax.set_xticks(x_pos) ax.set_xticklabels([f'第{i}对' for i in x_pos]) ax.set_ylim(0, 1.1) for i, (r, ci) in enumerate(zip(corrs, cis)): ax.annotate(f'{r:.3f}', (i + 1, r + 0.05), ha='center', fontsize=10) plt.tight_layout() if savefig: plt.savefig(savefig, dpi=150, bbox_inches='tight') plt.show() return fig
[文档] def export_results(self, path: str, format: str = 'excel'): """ 导出分析结果。 Parameters ---------- path : str 保存路径 format : str, default='excel' 格式:'excel' 或 'csv' """ if self.result is None: raise ValueError("请先调用fit()方法") if format == 'excel': with pd.ExcelWriter(path, engine='openpyxl') as writer: corrs_df = pd.DataFrame({ '典型变量对': [f'第{i+1}对' for i in range(len(self.result['canonical_correlations']))], '典型相关系数': self.result['canonical_correlations'], 'CI下限': [ci[0] for ci in self.result['confidence_intervals']], 'CI上限': [ci[1] for ci in self.result['confidence_intervals']] }) if self.result['significance_tests']: corrs_df['Wilks_Lambda'] = [t['wilks_lambda'] for t in self.result['significance_tests']] corrs_df['Chi_Square'] = [t['chi_square'] for t in self.result['significance_tests']] corrs_df['df'] = [t['df'] for t in self.result['significance_tests']] corrs_df['p_value'] = [t['p_value'] for t in self.result['significance_tests']] corrs_df.to_excel(writer, sheet_name='典型相关系数', index=False) x_weights_df = pd.DataFrame( self.result['x_weights'], index=self.result['x_names'], columns=[f'第{i+1}对' for i in range(self.result['n_components'])] ) x_weights_df.to_excel(writer, sheet_name='X权重') y_weights_df = pd.DataFrame( self.result['y_weights'], index=self.result['y_names'], columns=[f'第{i+1}对' for i in range(self.result['n_components'])] ) y_weights_df.to_excel(writer, sheet_name='Y权重') redundancy_df = pd.DataFrame({ '典型变量对': [f'第{i+1}对' for i in range(self.result['n_components'])], 'X方差解释比例': self.result['redundancy']['x_variance_explained'], 'Y方差解释比例': self.result['redundancy']['y_variance_explained'], 'X被Y解释的冗余': self.result['redundancy']['x_given_y'], 'Y被X解释的冗余': self.result['redundancy']['y_given_x'] }) redundancy_df.to_excel(writer, sheet_name='冗余分析', index=False) elif format == 'csv': import os base, ext = os.path.splitext(path) corrs_df = pd.DataFrame({ 'canonical_pair': range(1, len(self.result['canonical_correlations']) + 1), 'correlation': self.result['canonical_correlations'], 'ci_lower': [ci[0] for ci in self.result['confidence_intervals']], 'ci_upper': [ci[1] for ci in self.result['confidence_intervals']] }) corrs_df.to_csv(f"{base}_correlations.csv", index=False) x_weights_df = pd.DataFrame( self.result['x_weights'], index=self.result['x_names'], columns=[f'pair_{i+1}' for i in range(self.result['n_components'])] ) x_weights_df.to_csv(f"{base}_x_weights.csv") y_weights_df = pd.DataFrame( self.result['y_weights'], index=self.result['y_names'], columns=[f'pair_{i+1}' for i in range(self.result['n_components'])] ) y_weights_df.to_csv(f"{base}_y_weights.csv") else: raise ValueError(f"不支持的格式: {format}")
def __repr__(self): return f"CCAAnalyzer(X_vars={self.result['n_x_vars'] if self.result else self.X.shape[1]}, " \ f"Y_vars={self.result['n_y_vars'] if self.result else self.Y.shape[1]})"