示例代码

本节提供 PyCorrAna 的各种使用示例。

基础示例

快速分析 CSV 文件

from pycorrana import quick_corr

result = quick_corr('sales_data.csv')

print(result['significant_pairs'][:5])

分析 Excel 数据

from pycorrana import quick_corr

result = quick_corr(
    'data.xlsx',
    target='revenue',
    export='correlation_results.xlsx'
)

指定分析方法

from pycorrana import CorrAnalyzer

analyzer = CorrAnalyzer(
    df,
    method='spearman',
    missing_strategy='fill',
    fill_method='median'
)

result = analyzer.fit()
analyzer.plot_heatmap()

数据分析流程

完整分析流程

import pandas as pd
from pycorrana import CorrAnalyzer

df = pd.read_csv('data.csv')

analyzer = CorrAnalyzer(df, verbose=True)

analyzer.preprocess()

analyzer.compute_correlation(target='target_column')

result = {
    'correlation_matrix': analyzer.corr_matrix,
    'pvalue_matrix': analyzer.pvalue_matrix,
    'significant_pairs': analyzer.significant_pairs
}

analyzer.plot_heatmap(figsize=(14, 12), cluster=True)

analyzer.export_results('results.xlsx')

分步分析

from pycorrana import CorrAnalyzer
from pycorrana.utils.data_utils import infer_types, handle_missing

type_mapping = infer_types(df)
print("数据类型:", type_mapping)

df_clean = handle_missing(df, strategy='fill', fill_method='mean')

analyzer = CorrAnalyzer(df_clean, method='auto')

result = analyzer.fit(columns=['var1', 'var2', 'var3', 'target'])

print(analyzer.summary())

可视化示例

自定义热力图

from pycorrana import CorrAnalyzer

analyzer = CorrAnalyzer(df)
analyzer.fit()

analyzer.plot_heatmap(
    figsize=(16, 14),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    cluster=True,
    cluster_method='average',
    savefig='heatmap.png',
    dpi=300
)

散点图矩阵

analyzer.plot_pairplot(
    columns=['age', 'income', 'education', 'score'],
    hue='gender',
    diag_kind='kde',
    corner=True,
    savefig='pairplot.png'
)

分组箱线图

analyzer.plot_boxplot(
    numeric_col='salary',
    categorical_col='department',
    kind='violin',
    savefig='salary_by_dept.png'
)

相关网络图

analyzer.visualizer.plot_correlation_network(
    analyzer.corr_matrix,
    threshold=0.4,
    node_size=1000,
    layout='circular',
    savefig='network.png'
)

显著相关对条形图

analyzer.visualizer.plot_significant_pairs(
    analyzer.significant_pairs,
    top_n=15,
    savefig='top_correlations.png'
)

偏相关分析示例

控制单个协变量

from pycorrana import partial_corr

r, p = partial_corr(
    df,
    x='income',
    y='happiness',
    covars='age'
)

print(f"偏相关系数: {r:.4f}, p值: {p:.4f}")

控制多个协变量

from pycorrana import partial_corr

covars = ['age', 'education', 'gender', 'location']

r, p = partial_corr(
    df,
    x='income',
    y='happiness',
    covars=covars
)

偏相关矩阵

from pycorrana import partial_corr_matrix

matrix = partial_corr_matrix(
    df,
    covars=['age'],
    columns=['income', 'health', 'happiness', 'social']
)

print(matrix)

使用 PartialCorrAnalyzer

from pycorrana import PartialCorrAnalyzer

analyzer = PartialCorrAnalyzer(df, covars=['age', 'education'])

result = analyzer.fit(x='income', y='happiness')

matrix = analyzer.compute_matrix(columns=['income', 'health', 'happiness'])

非线性分析示例

距离相关

from pycorrana import distance_correlation
import numpy as np

x = np.random.randn(100)
y = x ** 2 + np.random.randn(100) * 0.1

dcor = distance_correlation(x, y)
print(f"距离相关系数: {dcor:.4f}")

互信息分析

from pycorrana import mutual_info_score

mi = mutual_info_score(df['feature1'], df['feature2'])
print(f"互信息: {mi:.4f}")

最大信息系数

from pycorrana import maximal_information_coefficient

mic = maximal_information_coefficient(df['x'], df['y'])
print(f"MIC: {mic['mic']:.4f}")

备注

性能说明:当前 MIC 实现为纯 Python 版本,计算速度较慢。对于大数据集,建议先采样:

from pycorrana.utils import smart_sample

# 采样后再计算 MIC
sampled_df = smart_sample(df, sample_size=500)
mic = maximal_information_coefficient(sampled_df['x'], sampled_df['y'])

非线性依赖报告

from pycorrana import nonlinear_dependency_report

report = nonlinear_dependency_report(
    df,
    top_n=20,
    methods=['dcor', 'mic']
)

print(report)

使用 NonlinearAnalyzer

from pycorrana import NonlinearAnalyzer

analyzer = NonlinearAnalyzer(df)

result = analyzer.analyze_all(top_n=10)

analyzer.plot_nonlinear_pairs(savefig='nonlinear.png')

示例数据集使用

鸢尾花数据集

from pycorrana import load_iris, quick_corr

iris = load_iris()

result = quick_corr(iris, target='species')

泰坦尼克数据集

from pycorrana import load_titanic, CorrAnalyzer

titanic = load_titanic()

analyzer = CorrAnalyzer(
    titanic,
    missing_strategy='fill',
    fill_method='median'
)

result = analyzer.fit(target='survived')

葡萄酒数据集

from pycorrana import load_wine, quick_corr

wine = load_wine()

result = quick_corr(wine, plot=True)

生成模拟数据

from pycorrana import make_correlated_data, CorrAnalyzer

df = make_correlated_data(
    n_samples=500,
    n_features=8,
    correlation_strength=0.6,
    noise_level=0.2
)

analyzer = CorrAnalyzer(df)
result = analyzer.fit()
analyzer.plot_heatmap(cluster=True)

典型相关分析示例

基本 CCA 分析

from pycorrana import cca, load_iris

df = load_iris()

# 定义两组变量
X = df[['sepal_length', 'sepal_width']]
Y = df[['petal_length', 'petal_width']]

# 执行典型相关分析
result = cca(X, Y)

print("典型相关系数:", result['canonical_correlations'])
# 输出: [0.9409, 0.1222]

查看详细结果

result = cca(X, Y)

# 典型相关系数
print("典型相关系数:")
for i, r in enumerate(result['canonical_correlations']):
    print(f"  第 {i+1} 对: {r:.4f}")

# X 变量的典型系数
print("\nX 变量典型系数:")
print(result['x_weights'])

# Y 变量的典型系数
print("\nY 变量典型系数:")
print(result['y_weights'])

# 显著性检验
print("\n显著性检验:")
for test in result['significance_tests']:
    print(f"  典型相关 {test['canonical_index'] + 1}: "
          f"Wilks' λ = {test['wilks_lambda']:.4f}, "
          f"p = {test['p_value']:.4f}")

置换检验

from pycorrana import cca_permutation_test

result = cca_permutation_test(
    X, Y,
    n_permutations=1000,
    random_state=42
)

print("原始典型相关系数:", result['canonical_correlations'])
print("置换检验 p 值:", result['permutation_pvalues'])

使用 CCAAnalyzer 类

from pycorrana import CCAAnalyzer

analyzer = CCAAnalyzer()
result = analyzer.fit(X, Y)

# 获取典型变量得分
scores_x, scores_y = analyzer.get_scores(X, Y)

# 典型变量相关性
print("典型变量得分相关性:")
print(scores_x.corrwith(scores_y))

实际应用示例

分析心理健康数据:

from pycorrana import cca
import pandas as pd

df = pd.read_csv('psychology_data.csv')

# 心理测量变量
psychological = df[['anxiety', 'depression', 'stress']]

# 生理测量变量
physiological = df[['heart_rate', 'blood_pressure', 'cortisol']]

result = cca(psychological, physiological)

print("心理-生理典型相关系数:", result['canonical_correlations'])

# 解读第一对典型变量
print("\n心理变量权重:", result['x_weights'][:, 0])
print("生理变量权重:", result['y_weights'][:, 0])

高级用法

自定义分析流程

import numpy as np
from pycorrana import CorrAnalyzer
from pycorrana.utils.data_utils import load_data, infer_types
from pycorrana.utils.stats_utils import correct_pvalues

df = load_data('data.csv')

type_mapping = infer_types(df)
numeric_cols = [k for k, v in type_mapping.items() if v == 'numeric']

analyzer = CorrAnalyzer(df[numeric_cols], method='pearson')
result = analyzer.fit()

pvalues = result['pvalue_matrix'].values.flatten()
pvalues = pvalues[~np.isnan(pvalues)]
corrected = correct_pvalues(pvalues.tolist(), method='bonferroni')

批量处理多个文件

import os
from pycorrana import quick_corr

data_dir = 'data/'
output_dir = 'results/'

for filename in os.listdir(data_dir):
    if filename.endswith('.csv'):
        filepath = os.path.join(data_dir, filename)
        stem = os.path.splitext(filename)[0]
        output_path = os.path.join(output_dir, f'{stem}_results.xlsx')

        result = quick_corr(
            filepath,
            export=output_path,
            plot=False,
            verbose=False
        )

        print(f"Processed: {filename}")

结合 pandas 分析

import pandas as pd
from pycorrana import CorrAnalyzer

df = pd.read_csv('data.csv')

grouped = df.groupby('category')

for name, group in grouped:
    print(f"\n=== Group: {name} ===")
    analyzer = CorrAnalyzer(group.drop(columns='category'))
    result = analyzer.fit()
    print(analyzer.summary())