162140117 陈思远
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.ioff()
dataset_path = '../dataset/data.csv'
result_path = '../result/'
if not os.path.exists(result_path):
    os.mkdir(result_path)
data = pd.read_csv(dataset_path, index_col=0, header=0)
data["Diagnosis"].replace({"M": 1, "B": 0}, inplace=True)
data
id | Diagnosis | radius1 | texture1 | perimeter1 | area1 | smoothness1 | compactness1 | concavity1 | concave_points1 | symmetry1 | ... | radius3 | texture3 | perimeter3 | area3 | smoothness3 | compactness3 | concavity3 | concave_points3 | symmetry3 | fractal_dimension3 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
842302 | 1 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | ... | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
842517 | 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | ... | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
84300903 | 1 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | ... | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
84348301 | 1 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | ... | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
84358402 | 1 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | ... | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
926424 | 1 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | ... | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
926682 | 1 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | ... | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
926954 | 1 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | ... | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
927241 | 1 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | ... | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
92751 | 0 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | ... | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 31 columns
# z-score
y = data.to_numpy()[:, 0]
x = data.to_numpy()[:, 1:]
x = (x - x.mean(axis=0)) / x.std(axis=0)
x
array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,  2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 , -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,  1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869, -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,  1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282, -0.04813821, -0.75120669]])
import seaborn as sns
sns.set_style("whitegrid")
for i in range(3):
    fig, axes = plt.subplots(nrows=1, ncols=10, figsize=(20, 2), facecolor=None)
    for j in range(10):
        sns.histplot(data.iloc[:, i*10+j+1], kde=True, ax=axes[j], color='skyblue', alpha=0.7)
        axes[j].set_xlabel(data.columns[i*10+j+1])
        if j == 0:
            axes[j].set_ylabel('Frequency')
        else:
            axes[j].set_ylabel(None)
    plt.tight_layout()
    plt.savefig(result_path + f"Attribute_hist_{i + 1}.svg", format='svg')
from scipy.stats import shapiro
from matplotlib.cm import get_cmap
exam_table = {'Attribute': list(data.columns)[1:],
              'Statistic': [],
              'P-value': [],
              'Normality': []}
for j in range(data.shape[1]-1):
    statistic, p_value = shapiro(data.iloc[:, j+1])
    exam_table['Statistic'].append(statistic)
    exam_table['P-value'].append(p_value)
    exam_table['Normality'].append(p_value > 0.05)
pd.DataFrame(exam_table).set_index('Attribute').to_excel(result_path + 'normality_test.xlsx')
# Bar chart of -log10(p-value) from the normality tests
cmap = get_cmap('Reds')
plt.figure(figsize=(10, 3))
plt.axhline(-np.log10(0.05), color='red', linestyle='dashed', linewidth=2, label='alpha=0.05')
plt.bar(range(1, len(exam_table['P-value']) + 1), -np.log10(exam_table['P-value']),
        color=[cmap(np.log10(p)/min(np.log10(exam_table['P-value']))) for p in exam_table['P-value']])
plt.xlabel('Attributes')
plt.ylabel('-log10(p-value)')
plt.legend()
plt.xticks(range(1, len(exam_table['P-value']) + 1), data.columns[1:], rotation='vertical')
plt.savefig(result_path + 'normality_test.svg', format='svg', bbox_inches='tight')
pd.DataFrame(exam_table).set_index('Attribute')
Attribute | Statistic | P-value | Normality |
---|---|---|---|
radius1 | 0.941070 | 3.106155e-14 | False |
texture1 | 0.976720 | 7.281474e-08 | False |
perimeter1 | 0.936183 | 7.011630e-15 | False |
area1 | 0.858402 | 3.196511e-22 | False |
smoothness1 | 0.987487 | 8.599346e-05 | False |
compactness1 | 0.916978 | 3.967941e-17 | False |
concavity1 | 0.866831 | 1.338583e-21 | False |
concave_points1 | 0.891650 | 1.404436e-19 | False |
symmetry1 | 0.972588 | 7.881996e-09 | False |
fractal_dimension1 | 0.923284 | 1.956494e-16 | False |
radius2 | 0.745554 | 1.224555e-28 | False |
texture2 | 0.896168 | 3.560367e-19 | False |
perimeter2 | 0.718164 | 7.587721e-30 | False |
area2 | 0.563820 | 2.652721e-35 | False |
smoothness2 | 0.838410 | 1.361936e-23 | False |
compactness2 | 0.836878 | 1.082880e-23 | False |
concavity2 | 0.672070 | 1.101694e-31 | False |
concave_points2 | 0.919706 | 7.826044e-17 | False |
symmetry2 | 0.828387 | 3.127302e-24 | False |
fractal_dimension2 | 0.695079 | 8.551023e-31 | False |
radius3 | 0.913493 | 1.704317e-17 | False |
texture3 | 0.982562 | 2.564881e-06 | False |
perimeter3 | 0.912585 | 1.373282e-17 | False |
area3 | 0.816070 | 5.595767e-25 | False |
smoothness3 | 0.988620 | 2.097363e-04 | False |
compactness3 | 0.891065 | 1.247598e-19 | False |
concavity3 | 0.917527 | 4.542876e-17 | False |
concave_points3 | 0.964836 | 1.985903e-10 | False |
symmetry3 | 0.916144 | 3.234118e-17 | False |
fractal_dimension3 | 0.889546 | 9.195530e-20 | False |
data_mean = np.mean(data.iloc[:, 1:].to_numpy(), axis=0)
data_variance_max_like = []
data_variance_unbiased = []
for j in range(data.shape[1]-1):
    data_variance_unbiased.append(np.var(data.iloc[:, j+1], ddof=1))
    data_variance_max_like.append(np.var(data.iloc[:, j+1], ddof=0))
estimate_table = {'Attribute': list(data.columns)[1:],
                  'mean': data_mean,
                  'max likelihood variance': data_variance_max_like,
                  'unbiased variance': data_variance_unbiased}
pd.DataFrame(estimate_table).set_index('Attribute').to_excel(result_path + 'max_like_unbiase.xlsx')
pd.DataFrame(estimate_table).set_index('Attribute')
Attribute | mean | max likelihood variance | unbiased variance |
---|---|---|---|
radius1 | 14.127292 | 12.397094 | 12.418920 |
texture1 | 19.289649 | 18.466397 | 18.498909 |
perimeter1 | 91.969033 | 589.402799 | 590.440480 |
area1 | 654.889104 | 123625.903080 | 123843.554318 |
smoothness1 | 0.096360 | 0.000197 | 0.000198 |
compactness1 | 0.104341 | 0.002784 | 0.002789 |
concavity1 | 0.088799 | 0.006344 | 0.006355 |
concave_points1 | 0.048919 | 0.001503 | 0.001506 |
symmetry1 | 0.181162 | 0.000750 | 0.000752 |
fractal_dimension1 | 0.062798 | 0.000050 | 0.000050 |
radius2 | 0.405172 | 0.076767 | 0.076902 |
texture2 | 1.216853 | 0.303781 | 0.304316 |
perimeter2 | 2.866059 | 4.080711 | 4.087896 |
area2 | 40.337079 | 2065.794621 | 2069.431583 |
smoothness2 | 0.007041 | 0.000009 | 0.000009 |
compactness2 | 0.025478 | 0.000320 | 0.000321 |
concavity2 | 0.031894 | 0.000910 | 0.000911 |
concave_points2 | 0.011796 | 0.000038 | 0.000038 |
symmetry2 | 0.020542 | 0.000068 | 0.000068 |
fractal_dimension2 | 0.003795 | 0.000007 | 0.000007 |
radius3 | 16.269190 | 23.319169 | 23.360224 |
texture3 | 25.677223 | 37.710092 | 37.776483 |
perimeter3 | 107.261213 | 1127.146434 | 1129.130847 |
area3 | 880.583128 | 323597.670893 | 324167.385102 |
smoothness3 | 0.132369 | 0.000520 | 0.000521 |
compactness3 | 0.254265 | 0.024711 | 0.024755 |
concavity3 | 0.272188 | 0.043448 | 0.043524 |
concave_points3 | 0.114606 | 0.004313 | 0.004321 |
symmetry3 | 0.290076 | 0.003821 | 0.003828 |
fractal_dimension3 | 0.083946 | 0.000326 | 0.000326 |
from utils import three_population_mean_wilks_test
alpha = 0.01
# After z-scoring every column has mean ~0, so the three 10-dimensional feature
# groups necessarily share (numerically) equal mean vectors and the test accepts.
print(x[:, :10].mean(axis=0), x[:, 10:20].mean(axis=0), x[:, 20:30].mean(axis=0))
_, p_value = three_population_mean_wilks_test(x[:, :10], x[:, 10:20], x[:, 20:30], alpha)
print(f'alpha:{alpha}, p_value:{p_value}, ' + ('accept: the three population mean vectors are equal' if p_value > alpha else 'reject: at least two of the three population mean vectors differ'))
[-1.37363271e-16  6.86816353e-17 -1.24875700e-16 -2.18532476e-16 -8.36667193e-16  1.87313551e-16  4.99502802e-17 -4.99502802e-17  1.74825981e-16  4.74527662e-16]
[ 2.37263831e-16 -1.12388130e-16 -1.12388130e-16 -1.31119486e-16 -1.52972733e-16  1.74825981e-16  1.62338411e-16  0.00000000e+00  8.74129903e-17 -6.24378502e-18]
[-8.24179623e-16  1.24875700e-17 -3.74627101e-16  0.00000000e+00 -2.37263831e-16 -3.37164391e-16  7.49254203e-17  2.24776261e-16  2.62238971e-16 -5.74428222e-16]
alpha:0.01, p_value:1.0, accept: the three population mean vectors are equal
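The helper three_population_mean_wilks_test comes from a local utils module that is not shown here. A minimal sketch of one plausible implementation follows: a one-way MANOVA via Wilks' Λ with Bartlett's χ² approximation. The name, signature, and the omission of the alpha argument below are illustrative assumptions, not the actual utils code.

import numpy as np
from scipy import stats

def wilks_test_sketch(*samples):
    # Wilks' lambda test for equality of several population mean vectors
    pooled = np.vstack(samples)
    n, p = pooled.shape
    g = len(samples)
    grand_mean = pooled.mean(axis=0)
    # within-group (W) and between-group (B) scatter matrices
    W = sum((s - s.mean(axis=0)).T @ (s - s.mean(axis=0)) for s in samples)
    B = sum(len(s) * np.outer(s.mean(axis=0) - grand_mean, s.mean(axis=0) - grand_mean) for s in samples)
    wilks_lambda = np.linalg.det(W) / np.linalg.det(W + B)
    # Bartlett's chi-square approximation with p*(g-1) degrees of freedom
    chi2_stat = -(n - 1 - (p + g) / 2) * np.log(wilks_lambda)
    p_value = stats.chi2.sf(chi2_stat, p * (g - 1))
    return chi2_stat, p_value

Under this sketch, wilks_test_sketch(x[:, :10], x[:, 10:20], x[:, 20:30]) would return the χ² statistic and its p-value.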
from utils import two_population_mean_t_square_test
one_test_table = {'Attribute': list(data.columns)[1:] + ['total'],
                  'F value': [],
                  'probability': []}
for j in range(x.shape[1]):
    F, P = two_population_mean_t_square_test(x[y==1, j].reshape(-1, 1), x[y==0, j].reshape(-1, 1))
    one_test_table['F value'].append(F.item())
    one_test_table['probability'].append(P.item())
F, P = two_population_mean_t_square_test(x[y==1, :], x[y==0, :])
one_test_table['F value'].append(F.item())
one_test_table['probability'].append(P.item())
pd.DataFrame(one_test_table).set_index('Attribute').to_excel(result_path + 'two_pop_mean_test.xlsx')
pd.DataFrame(one_test_table).set_index('Attribute')
Attribute | F value | probability |
---|---|---|
radius1 | 158648.550490 | 1.110223e-16 |
texture1 | 33954.857713 | 1.110223e-16 |
perimeter1 | 169959.179001 | 1.110223e-16 |
area1 | 130683.235640 | 1.110223e-16 |
smoothness1 | 24104.742267 | 1.110223e-16 |
compactness1 | 78854.493752 | 1.110223e-16 |
concavity1 | 132067.280936 | 1.110223e-16 |
concave_points1 | 203854.992953 | 1.110223e-16 |
symmetry1 | 19168.344304 | 1.110223e-16 |
fractal_dimension1 | 25.716686 | 5.363553e-07 |
radius2 | 60475.395925 | 1.110223e-16 |
texture2 | 11.638275 | 6.921148e-04 |
perimeter2 | 56616.514363 | 1.110223e-16 |
area2 | 52129.518051 | 1.110223e-16 |
smoothness2 | 735.781851 | 1.110223e-16 |
compactness2 | 14644.728409 | 1.110223e-16 |
concavity2 | 12186.141124 | 1.110223e-16 |
concave_points2 | 32389.719054 | 1.110223e-16 |
symmetry2 | 6.228518 | 1.285395e-02 |
fractal_dimension2 | 1070.971098 | 1.110223e-16 |
radius3 | 203622.657485 | 1.110223e-16 |
texture3 | 42528.121664 | 1.110223e-16 |
perimeter3 | 212141.231806 | 1.110223e-16 |
area3 | 146278.506093 | 1.110223e-16 |
smoothness3 | 33935.502134 | 1.110223e-16 |
compactness3 | 74208.953058 | 1.110223e-16 |
concavity3 | 115839.231622 | 1.110223e-16 |
concave_points3 | 255791.246766 | 1.110223e-16 |
symmetry3 | 29182.209247 | 1.110223e-16 |
fractal_dimension3 | 16822.157486 | 1.110223e-16 |
total | 16043.174191 | 1.110223e-16 |
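two_population_mean_t_square_test also lives in utils and is not shown. Below is a sketch of a standard two-sample Hotelling T² test with pooled covariance, consistent with the (F, P) pair it returns; the function is an illustrative assumption, not the repo's code.

import numpy as np
from scipy import stats

def hotelling_t2_sketch(a, b):
    # two-sample Hotelling T^2 test for equal mean vectors, pooled covariance
    n1, p = a.shape
    n2 = b.shape[0]
    diff = a.mean(axis=0) - b.mean(axis=0)
    s_pooled = ((n1 - 1) * np.atleast_2d(np.cov(a, rowvar=False, ddof=1)) +
                (n2 - 1) * np.atleast_2d(np.cov(b, rowvar=False, ddof=1))) / (n1 + n2 - 2)
    t2 = (n1 * n2) / (n1 + n2) * diff @ np.linalg.inv(s_pooled) @ diff
    # T^2 maps to an F statistic with (p, n1 + n2 - p - 1) degrees of freedom
    f_stat = (n1 + n2 - p - 1) / ((n1 + n2 - 2) * p) * t2
    p_value = stats.f.sf(f_stat, p, n1 + n2 - p - 1)
    return f_stat, p_value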
sns.set_style("whitegrid")
color_map = ['red', 'green', 'orange']
group_names = ['mean', 'SE', 'worst']  # the 1/2/3 feature-group suffixes in the WDBC data
fig, axes = plt.subplots(nrows=1, ncols=10, figsize=(15, 2), facecolor=None)
for j in range(10):
    for i in range(3):
        sns.kdeplot(x[:, i*10+j], ax=axes[j], color=color_map[i], label=group_names[i], fill=False)
    axes[j].set_xlabel(data.columns[j+1].replace('1', ''))
    if j == 0 or j == 5:
        axes[j].set_ylabel('Density')
    else:
        axes[j].set_ylabel(None)
plt.legend()
plt.tight_layout()
plt.savefig(result_path + "Kernel_density_curve_different_cell.svg", format='svg')
# Compare the kernel density curves of the two labels (M vs. B)
sns.set_style("whitegrid")
for i in range(3):
    fig, axes = plt.subplots(nrows=1, ncols=10, figsize=(15, 2), facecolor=None)
    for j in range(10):
        sns.kdeplot(x[y==1, i*10+j], ax=axes[j], color='OrangeRed', label='M', fill=False)
        sns.kdeplot(x[y==0, i*10+j], ax=axes[j], color='DodgerBlue', label='B', fill=False)
        axes[j].set_xlabel(data.columns[i*10+j+1])
        if j == 0:
            axes[j].set_ylabel('Density')
        else:
            axes[j].set_ylabel(None)
    plt.legend()
    plt.tight_layout()
    plt.savefig(result_path + f"Kernel_density_curve_different_label_{i}.svg", format='svg')
plt.figure(figsize=(15, 9))
for i in range(3):
    if i == 0:
        ax = plt.subplot2grid((2, 3), (0, 0), colspan=2, rowspan=2)
    else:
        ax = plt.subplot2grid((2, 3), (i-1, 2))
    correlation_matrix = np.corrcoef(x[:, i*10:(i+1)*10], rowvar=False)
    sns.heatmap(correlation_matrix, vmax=1,
                xticklabels=data.columns[i*10+1:(i+1)*10+1],
                yticklabels=data.columns[i*10+1:(i+1)*10+1])
plt.tight_layout()
plt.savefig(result_path + "Correlation_Heatmap.svg", format='svg', bbox_inches='tight')
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
reg_x = x[:, 1:10]  # predictors: texture1 .. fractal_dimension1 (standardized)
reg_y = x[:, 0]     # response: radius1 (standardized)
n, m = reg_x.shape
reg = LinearRegression()
reg.fit(reg_x, reg_y)
pred_y = reg.predict(reg_x)
train_mse = mean_squared_error(reg_y, pred_y)
print(f"Training MSE: {train_mse}")
plt.figure(figsize=(15, 6))
for i in range(9):
    if i == 1:
        ax = plt.subplot2grid((2, 6), (0, 0), colspan=2, rowspan=2)
    elif i == 0:
        ax = plt.subplot2grid((2, 6), (0, 2))
    else:
        ax = plt.subplot2grid((2, 6), (i//5, i%5+i//5+1))
    show_index = i
    plt.scatter(reg_x[:, show_index], reg_y, s=3, label='Data points', color="DodgerBlue")
    # line with the fitted partial slope for this predictor, through the shared intercept
    plt.plot(reg_x[:, show_index], reg.coef_[show_index]*reg_x[:, show_index]+reg.intercept_, color='OrangeRed', label='Regression Line')
    plt.xlabel(data.columns[show_index+2], fontsize=15)
    plt.ylabel(data.columns[1], fontsize=15)
    if i == 1:
        plt.legend(fontsize=15)
plt.tight_layout()
plt.savefig(result_path + "regression_analysis.svg", format='svg', transparent=True)
Training MSE: 0.000622352731429911
from scipy.cluster.hierarchy import linkage, dendrogram
distance_methods = ['ward', 'single', 'complete']
for method in distance_methods:
    plt.figure(figsize=(12, 8))
    linkage_matrix = linkage(x, method=method)
    dendrogram(linkage_matrix, no_labels=True)
    plt.ylabel('distance', fontsize=15)
    plt.title(f'hierarchical diagram with {method}', fontsize=15)
    plt.savefig(result_path + f'hierarchical_diagram_with_{method}.svg', format='svg')
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(x_pca)
labels = kmeans.labels_
centers = kmeans.cluster_centers_
# Align cluster ids with the true labels: flip the assignment if cluster 1 is not the majority-M cluster
if sum(labels[y==1])/len(labels[y==1]) < 0.5:
    labels = labels == 0
plt.figure(figsize=(5, 5))
plt.scatter(x_pca[labels==0, 0], x_pca[labels==0, 1], s=20, color='DodgerBlue', alpha=0.7, label='B')
plt.scatter(x_pca[labels==1, 0], x_pca[labels==1, 1], s=20, color='OrangeRed', alpha=0.7, label='M')
plt.scatter(centers[:, 0], centers[:, 1], marker='o', s=50, color='red', edgecolors='black', linewidths=2, label='Cluster Center')
plt.legend()
plt.savefig(result_path + 'kmeans_cluster.svg', format='svg')
right_B = [i for i in range(x.shape[0]) if labels[i]==0 and y[i]==0]
wrong_B = [i for i in range(x.shape[0]) if labels[i]==0 and y[i]==1]
right_M = [i for i in range(x.shape[0]) if labels[i]==1 and y[i]==1]
wrong_M = [i for i in range(x.shape[0]) if labels[i]==1 and y[i]==0]
plt.figure(figsize=(5, 5))
plt.scatter(x_pca[right_B, 0], x_pca[right_B, 1], s=20, color='LightSkyBlue', alpha=0.7, label='right B')
plt.scatter(x_pca[wrong_B, 0], x_pca[wrong_B, 1], s=20, color='OrangeRed', alpha=0.7, marker='x', label='wrong B')
plt.scatter(x_pca[right_M, 0], x_pca[right_M, 1], s=20, color='LightPink', alpha=0.7, label='right M')
plt.scatter(x_pca[wrong_M, 0], x_pca[wrong_M, 1], s=20, color='DodgerBlue', alpha=0.7, marker='x', label='wrong M')
plt.scatter(centers[:, 0], centers[:, 1], marker='o', s=50, color='red', edgecolors='black', linewidths=2, label='Cluster Center')
plt.legend()
plt.savefig(result_path + 'kmeans_discriminate.svg', format='svg')
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, 1:].to_numpy(), y, test_size=0.3, stratify=y, random_state=6)
x_train_mean_M = x_train[y_train==1].mean(axis=0)
x_train_cov_M = np.cov(x_train[y_train==1], rowvar=False, ddof=1)
x_train_mean_B = x_train[y_train==0].mean(axis=0)
x_train_cov_B = np.cov(x_train[y_train==0], rowvar=False, ddof=1)
diff_M = x_test - x_train_mean_M
diff_B = x_test - x_train_mean_B
# Per-sample squared Mahalanobis distance: keep only the diagonal of the quadratic form
distance_from_M = (diff_M @ np.linalg.inv(x_train_cov_M) @ diff_M.T).diagonal()
distance_from_B = (diff_B @ np.linalg.inv(x_train_cov_B) @ diff_B.T).diagonal()
pred = (distance_from_M < distance_from_B).astype(int)  # nearer to the M population -> predict 1
distance_discrimination_accuracy = (pred == y_test).mean()
print(f"accuracy: {distance_discrimination_accuracy}")
accuracy: 0.26083239287302074
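As a cross-check on the quadratic-form computation above, scipy can produce the same per-sample distances. The snippet uses the variables defined in the cell above; cdist returns unsquared Mahalanobis distances, hence the squaring.

from scipy.spatial.distance import cdist
d_M = cdist(x_test, x_train_mean_M.reshape(1, -1), metric='mahalanobis', VI=np.linalg.inv(x_train_cov_M)).ravel() ** 2
d_B = cdist(x_test, x_train_mean_B.reshape(1, -1), metric='mahalanobis', VI=np.linalg.inv(x_train_cov_B)).ravel() ** 2
assert np.allclose(d_M, distance_from_M) and np.allclose(d_B, distance_from_B)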
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
y_pred = lda.predict(x_test)
accuracy = (y_pred == y_test).mean()
print(f"accuracy: {accuracy}")
accuracy: 0.9707602339181286
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_train)
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(x_pca, y_train)
plt.figure(figsize=(15, 10))
# Plot the data points
plt.scatter(x_pca[y_train == 1, 0], x_pca[y_train == 1, 1], marker='o', s=30,
            label='M', color='OrangeRed', edgecolors='black', linewidths=0.2, alpha=0.8)
plt.scatter(x_pca[y_train == 0, 0], x_pca[y_train == 0, 1], marker='o', s=30,
            label='B', color='DodgerBlue', edgecolors='black', linewidths=0.2, alpha=0.8)
x_min, x_max = (x_pca[:, 0].min(), x_pca[:, 0].max())
y_min, y_max = (x_pca[:, 1].min(), x_pca[:, 1].max())
# Draw the LDA decision line
intercept = lda.intercept_[0]
slope1 = -lda.coef_[0, 0] / lda.coef_[0, 1]
x_line = np.linspace(x_min, x_max, 100)
y_line = slope1 * x_line - intercept / lda.coef_[0, 1]
plt.plot(x_line, y_line, color='blue', linestyle='--', label='LDA Decision line')
# Projection axis (the direction of the LDA weight vector), offset downward for display
slope2 = lda.coef_[0, 1] / lda.coef_[0, 0]
x_line = np.linspace(x_min, x_max, 100)
y_line = slope2 * x_line - intercept / lda.coef_[0, 1] / slope1 * slope2 - 200
plt.plot(x_line, y_line, color='red', linestyle='--', label='LDA Project line')
# Line of the one-dimensional PCA projection
pca = PCA(n_components=1)
x_pca_pca = pca.fit_transform(x_pca)
components = pca.components_
projection_line = np.outer(x_pca_pca, components[0])
plt.plot(projection_line[:, 0], projection_line[:, 1], color='green', label='PCA Projection Line', linewidth=2)
plt.xlim([x_min*1.2, x_max*1.1])
plt.ylim([y_min*1.2, y_max*1.2])
plt.xlabel('feature 1', fontsize=15)
plt.ylabel('feature 2', fontsize=15)
plt.legend(fontsize=13)
plt.gca().set_aspect(1)
plt.savefig(result_path + 'LDA_discriminate.svg', format='svg', bbox_inches='tight')
# Principal component analysis
pca = PCA()
pca.fit(x)
# Percentage of variance explained by each principal component
explained_variance_ratio = pca.explained_variance_ratio_
# Plot the cumulative explained-variance curve
cmap = get_cmap('Reds')
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
for i, ratio in enumerate(cumulative_explained_variance, 1):
    plt.plot(i, ratio, marker='o', color=cmap(i / len(cumulative_explained_variance)))
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.savefig(result_path + 'PCA_explain_variance.svg', format='svg')
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, 1:].to_numpy(), y, test_size=0.3, stratify=y, random_state=6)
accuracy_list = []
for component_num in range(1, x_train.shape[1]+1):
    # PCA dimensionality reduction: fit on the training split, then project the test split
    pca = PCA(n_components=component_num)
    x_pca_train = pca.fit_transform(x_train)
    x_pca_test = pca.transform(x_test)
    # Mahalanobis distance discrimination with per-class covariance
    x_train_mean_M = x_pca_train[y_train==1].mean(axis=0)
    x_train_cov_M = np.atleast_2d(np.cov(x_pca_train[y_train==1], rowvar=False, ddof=1))
    x_train_mean_B = x_pca_train[y_train==0].mean(axis=0)
    x_train_cov_B = np.atleast_2d(np.cov(x_pca_train[y_train==0], rowvar=False, ddof=1))
    diff_M = x_pca_test - x_train_mean_M
    diff_B = x_pca_test - x_train_mean_B
    distance_from_M = (diff_M @ np.linalg.inv(x_train_cov_M) @ diff_M.T).diagonal()
    distance_from_B = (diff_B @ np.linalg.inv(x_train_cov_B) @ diff_B.T).diagonal()
    pred = (distance_from_M < distance_from_B).astype(int)  # nearer to M -> predict 1
    accuracy = (pred == y_test).mean()
    accuracy_list.append(accuracy)
plt.figure(figsize=(5, 5))
for component_num, accuracy in enumerate(accuracy_list, start=1):
    plt.plot(component_num, accuracy, marker='o', color=cmap((accuracy-min(accuracy_list))/(max(accuracy_list)-min(accuracy_list))+0.2))
plt.plot([0, len(accuracy_list)], [distance_discrimination_accuracy, distance_discrimination_accuracy],
         color='Crimson', label='without PCA')
plt.xlabel("Number of Principal Components", fontsize=13)
plt.ylabel("Accuracy of distance discrimination", fontsize=13)
plt.legend(fontsize=11)
plt.savefig(result_path + 'PCA_enhanced_distance_discriminate.svg', format='svg', bbox_inches='tight')
from factor_analyzer import FactorAnalyzer
def run_fa(rotation=None):
    n_factors = x.shape[1]
    analyzer = FactorAnalyzer(n_factors, rotation=rotation)
    # Fit the model and get the factor loadings
    analyzer.fit(data.iloc[:, 1:].to_numpy())
    loadings = analyzer.loadings_
    # Scree plot of the eigenvalues
    ev, _ = analyzer.get_eigenvalues()
    plt.figure(figsize=(5, 4))
    for eigenvalue_index, eigenvalue in enumerate(ev, start=1):
        plt.plot(eigenvalue_index, eigenvalue, marker='o', color=cmap((eigenvalue-min(ev))/(max(ev)-min(ev))+0.2))
    plt.title(f'Scree Plot (rotation:{rotation})')
    plt.xlabel('index of factor')
    plt.ylabel('eigenvalue')
    plt.savefig(result_path + f'FA_eigenvalue_rotation_{rotation}.svg', format='svg')
    plt.figure(figsize=(11, 9))
    sns.heatmap(loadings, xticklabels=[f'Factor {i+1}' for i in range(n_factors)], yticklabels=data.columns[1:])
    plt.title(f'Factor Loadings Heatmap (rotation:{rotation})')
    plt.savefig(result_path + f'FA_loadings_heatmap_rotation_{rotation}.svg', format='svg', bbox_inches='tight')
run_fa(rotation=None)
run_fa(rotation="varimax")
from factor_analyzer import FactorAnalyzer
n_factors = 15
fa = FactorAnalyzer(n_factors, rotation='varimax')  # use varimax rotation here
# Fit the model and get the factor loadings
fa.fit(data.iloc[:, 1:].to_numpy())
loadings = fa.loadings_
plt.figure(figsize=(15, 7))
sns.heatmap(loadings.T, xticklabels=data.columns[1:], yticklabels=[f'Factor {i+1}' for i in range(n_factors)])
plt.savefig(result_path + 'FA_loadings_15_heatmap_rotation_varimax.svg', format='svg', bbox_inches='tight')
# Visualize the communalities
communalities = fa.get_communalities()
cmap = get_cmap('Reds')
plt.figure(figsize=(10, 3))
plt.bar(range(1, len(communalities) + 1), communalities,
        color=[cmap(comm) for comm in np.interp(communalities, (communalities.min(), communalities.max()), (0, 1))])
plt.ylabel('Communality')
plt.xlabel('Variable')
plt.xticks(range(1, len(communalities) + 1), data.columns[1:], rotation='vertical')
plt.savefig(result_path + 'FA_communalities_of_each_variable.svg', format='svg', bbox_inches='tight')
# Variance contribution of each factor
plt.figure(figsize=(8, 4))
factor_variances = fa.get_factor_variance()
plt.bar(range(1, n_factors + 1), factor_variances[0], color=[cmap(vari/max(factor_variances[0])) for vari in factor_variances[0]])
plt.xlabel('Factor')
plt.ylabel('Factor Variance')
plt.savefig(result_path + 'FA_contribution_of_each_factor_to_total_variance.svg', format='svg')
n_factors = 2
fa = FactorAnalyzer(n_factors, rotation='varimax')
fa.fit(data.iloc[:, 1:].to_numpy())
# Factor loadings
loadings = fa.loadings_
# Factor scores
factor_scores = fa.transform(data.iloc[:, 1:].to_numpy())
# Visualize the factor scores, colored by diagnosis
plt.figure(figsize=(8, 6))
plt.scatter(x=factor_scores[:, 0][y==1], y=factor_scores[:, 1][y==1],
            marker='o', s=50, label='M', color='OrangeRed', edgecolors='black', linewidths=0.2, alpha=0.7)
plt.scatter(x=factor_scores[:, 0][y==0], y=factor_scores[:, 1][y==0],
            marker='o', s=50, label='B', color='DodgerBlue', edgecolors='black', linewidths=0.2, alpha=0.7)
plt.xlabel('Factor 1 Score')
plt.ylabel('Factor 2 Score')
plt.legend()
plt.gca().set_aspect(1)
plt.savefig(result_path + 'FA_factor_scores_visualization.svg', format='svg')
from utils import LDA_reduce_dim, PCA_reduce_dim, FA_reduce_dim
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=6)
x_train_lda = LDA_reduce_dim(x_train, y_train, reduced_dim=1)
x_test_lda = LDA_reduce_dim(x_test, y_test, reduced_dim=1)
x_train_pca = PCA_reduce_dim(x_train, reduced_dim=1)
x_test_pca = PCA_reduce_dim(x_test, reduced_dim=1)
x_train_pca_2 = PCA_reduce_dim(x_train, reduced_dim=2)
x_test_pca_2 = PCA_reduce_dim(x_test, reduced_dim=2)
x_train_pca_best = PCA_reduce_dim(x_train, reduced_dim=15)
x_test_pca_best = PCA_reduce_dim(x_test, reduced_dim=15)
x_train_fa = FA_reduce_dim(x_train, reduced_dim=1)
x_test_fa = FA_reduce_dim(x_test, reduced_dim=1)
x_train_fa_2 = FA_reduce_dim(x_train, reduced_dim=2)
x_test_fa_2 = FA_reduce_dim(x_test, reduced_dim=2)
x_train_fa_best = FA_reduce_dim(x_train, reduced_dim=15)
x_test_fa_best = FA_reduce_dim(x_test, reduced_dim=15)
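LDA_reduce_dim, PCA_reduce_dim, and FA_reduce_dim come from utils and are not shown; the sketches below are plausible sklearn/factor_analyzer wrappers, an assumption rather than the actual code. Note that each split is reduced independently, so x_test_lda is produced by fitting LDA with the test labels themselves; the LDA column in the summary therefore measures separability after a supervised projection rather than strict out-of-sample prediction.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from factor_analyzer import FactorAnalyzer

def LDA_reduce_dim_sketch(x, y, reduced_dim):
    # supervised projection; with two classes LDA allows at most one component
    return LinearDiscriminantAnalysis(n_components=reduced_dim).fit_transform(x, y)

def PCA_reduce_dim_sketch(x, reduced_dim):
    return PCA(n_components=reduced_dim).fit_transform(x)

def FA_reduce_dim_sketch(x, reduced_dim):
    fa_model = FactorAnalyzer(n_factors=reduced_dim, rotation='varimax')
    fa_model.fit(x)
    return fa_model.transform(x)  # factor scores as the reduced features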
final_analysis_record_table = {"分类方法": ["广义平方距离判别", "kmeans聚类判别", "支持向量机", "随机森林"],
"LDA": [],
"PCA": [],
"FA": [],
"PCA-2": [],
"FA-2": [],
"PCA-15": [],
"FA-15": []}
# One-dimensional reductions
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
axes[0].hist(x_train_lda[y_train==1], bins=20, alpha=0.8, label='M', color="OrangeRed")
axes[0].hist(x_train_lda[y_train==0], bins=20, alpha=0.8, label='B', color='DodgerBlue')
axes[0].set_xlabel('One-Dimensional Data', fontsize=14)
axes[0].set_ylabel('Frequency', fontsize=14)
axes[0].set_title('LDA', fontsize=16)
axes[0].legend()
axes[1].hist(x_train_pca[y_train==1], bins=20, alpha=0.8, label='M', color="OrangeRed")
axes[1].hist(x_train_pca[y_train==0], bins=20, alpha=0.8, label='B', color='DodgerBlue')
axes[1].set_xlabel('One-Dimensional Data', fontsize=14)
axes[1].set_ylabel('Frequency', fontsize=14)
axes[1].set_title('PCA', fontsize=16)
axes[1].legend()
axes[2].hist(x_train_fa[y_train==1], bins=20, alpha=0.8, label='M', color="OrangeRed")
axes[2].hist(x_train_fa[y_train==0], bins=20, alpha=0.8, label='B', color='DodgerBlue')
axes[2].set_xlabel('One-Dimensional Data', fontsize=14)
axes[2].set_ylabel('Frequency', fontsize=14)
axes[2].set_title('FA', fontsize=16)
axes[2].legend()
plt.tight_layout()
plt.savefig(result_path + 'summary_reduce_dim_one_dimensional_data.svg', format='svg', bbox_inches='tight')
# Two-dimensional reductions
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
axes[0].scatter(x=x_train_pca_2[:, 0][y_train==1], y=x_train_pca_2[:, 1][y_train==1], s=50, edgecolors='black', linewidths=0.2, label='M', color='OrangeRed', alpha=0.7)
axes[0].scatter(x=x_train_pca_2[:, 0][y_train==0], y=x_train_pca_2[:, 1][y_train==0], s=50, edgecolors='black', linewidths=0.2, label='B', color='DodgerBlue', alpha=0.7)
axes[0].set_xlabel('PCA component1', fontsize=14)
axes[0].set_ylabel('PCA component2', fontsize=14)
axes[0].set_title('PCA', fontsize=16)
axes[0].legend()
axes[1].scatter(x=x_train_fa_2[:, 0][y_train==1], y=x_train_fa_2[:, 1][y_train==1], edgecolors='black', linewidths=0.2, s=50, label='M', color='OrangeRed', alpha=0.7)
axes[1].scatter(x=x_train_fa_2[:, 0][y_train==0], y=x_train_fa_2[:, 1][y_train==0], edgecolors='black', linewidths=0.2, s=50, label='B', color='DodgerBlue', alpha=0.7)
axes[1].set_xlabel('FA factor1', fontsize=14)
axes[1].set_ylabel('FA factor2', fontsize=14)
axes[1].set_title('FA', fontsize=16)
axes[1].legend()
plt.tight_layout()
plt.savefig(result_path + 'summary_reduce_dim_two_dimensional_data.svg', format='svg', bbox_inches='tight')
from utils import distance_classifier
y_pred_lda = distance_classifier(x_train_lda, y_train, x_test_lda)
y_pred_pca = distance_classifier(x_train_pca, y_train, x_test_pca)
y_pred_fa = distance_classifier(x_train_fa, y_train, x_test_fa)
y_pred_pca_2 = distance_classifier(x_train_pca_2, y_train, x_test_pca_2)
y_pred_fa_2 = distance_classifier(x_train_fa_2, y_train, x_test_fa_2)
y_pred_pca_best = distance_classifier(x_train_pca_best, y_train, x_test_pca_best)
y_pred_fa_best = distance_classifier(x_train_fa_best, y_train, x_test_fa_best)
final_analysis_record_table["LDA"].append((y_test == y_pred_lda).mean())
final_analysis_record_table["PCA"].append((y_test == y_pred_pca).mean())
final_analysis_record_table["FA"].append((y_test == y_pred_fa).mean())
final_analysis_record_table["PCA-2"].append((y_test == y_pred_pca_2).mean())
final_analysis_record_table["FA-2"].append((y_test == y_pred_fa_2).mean())
final_analysis_record_table["PCA-15"].append((y_test == y_pred_pca_best).mean())
final_analysis_record_table["FA-15"].append((y_test == y_pred_fa_best).mean())
from utils import kmeans_classifier
y_pred_lda = kmeans_classifier(x_train_lda, y_train, x_test_lda)
y_pred_pca = kmeans_classifier(x_train_pca, y_train, x_test_pca)
y_pred_fa = kmeans_classifier(x_train_fa, y_train, x_test_fa)
y_pred_pca_2 = kmeans_classifier(x_train_pca_2, y_train, x_test_pca_2)
y_pred_fa_2 = kmeans_classifier(x_train_fa_2, y_train, x_test_fa_2)
y_pred_pca_best = kmeans_classifier(x_train_pca_best, y_train, x_test_pca_best)
y_pred_fa_best = kmeans_classifier(x_train_fa_best, y_train, x_test_fa_best)
final_analysis_record_table["LDA"].append((y_test == y_pred_lda).mean())
final_analysis_record_table["PCA"].append((y_test == y_pred_pca).mean())
final_analysis_record_table["FA"].append((y_test == y_pred_fa).mean())
final_analysis_record_table["PCA-2"].append((y_test == y_pred_pca_2).mean())
final_analysis_record_table["FA-2"].append((y_test == y_pred_fa_2).mean())
final_analysis_record_table["PCA-15"].append((y_test == y_pred_pca_best).mean())
final_analysis_record_table["FA-15"].append((y_test == y_pred_fa_best).mean())
from utils import svm_classfier
y_pred_lda = svm_classfier(x_train_lda, y_train, x_test_lda)
y_pred_pca = svm_classfier(x_train_pca, y_train, x_test_pca)
y_pred_fa = svm_classfier(x_train_fa, y_train, x_test_fa)
y_pred_pca_2 = svm_classfier(x_train_pca_2, y_train, x_test_pca_2)
y_pred_fa_2 = svm_classfier(x_train_fa_2, y_train, x_test_fa_2)
y_pred_pca_best = svm_classfier(x_train_pca_best, y_train, x_test_pca_best)
y_pred_fa_best = svm_classfier(x_train_fa_best, y_train, x_test_fa_best)
final_analysis_record_table["LDA"].append((y_test == y_pred_lda).mean())
final_analysis_record_table["PCA"].append((y_test == y_pred_pca).mean())
final_analysis_record_table["FA"].append((y_test == y_pred_fa).mean())
final_analysis_record_table["PCA-2"].append((y_test == y_pred_pca_2).mean())
final_analysis_record_table["FA-2"].append((y_test == y_pred_fa_2).mean())
final_analysis_record_table["PCA-15"].append((y_test == y_pred_pca_best).mean())
final_analysis_record_table["FA-15"].append((y_test == y_pred_fa_best).mean())
from utils import rf_classifier
y_pred_lda = rf_classifier(x_train_lda, y_train, x_test_lda)
y_pred_pca = rf_classifier(x_train_pca, y_train, x_test_pca)
y_pred_fa = rf_classifier(x_train_fa, y_train, x_test_fa)
y_pred_pca_2 = rf_classifier(x_train_pca_2, y_train, x_test_pca_2)
y_pred_fa_2 = rf_classifier(x_train_fa_2, y_train, x_test_fa_2)
y_pred_pca_best = rf_classifier(x_train_pca_best, y_train, x_test_pca_best)
y_pred_fa_best = rf_classifier(x_train_fa_best, y_train, x_test_fa_best)
final_analysis_record_table["LDA"].append((y_test == y_pred_lda).mean())
final_analysis_record_table["PCA"].append((y_test == y_pred_pca).mean())
final_analysis_record_table["FA"].append((y_test == y_pred_fa).mean())
final_analysis_record_table["PCA-2"].append((y_test == y_pred_pca_2).mean())
final_analysis_record_table["FA-2"].append((y_test == y_pred_fa_2).mean())
final_analysis_record_table["PCA-15"].append((y_test == y_pred_pca_best).mean())
final_analysis_record_table["FA-15"].append((y_test == y_pred_fa_best).mean())
pd.DataFrame(final_analysis_record_table).set_index("Classification method").to_excel(result_path + 'final_compare_result.xlsx')
pd.DataFrame(final_analysis_record_table).set_index("Classification method")
Classification method | LDA | PCA | FA | PCA-2 | FA-2 | PCA-15 | FA-15 |
---|---|---|---|---|---|---|---|
Generalized squared distance discriminant | 0.988304 | 0.912281 | 0.105263 | 0.947368 | 0.935673 | 0.912281 | 0.877193 |
k-means cluster discriminant | 0.982456 | 0.912281 | 0.105263 | 0.906433 | 0.941520 | 0.923977 | 0.947368 |
Support vector machine | 0.988304 | 0.906433 | 0.081871 | 0.947368 | 0.923977 | 0.953216 | 0.959064 |
Random forest | 0.976608 | 0.888889 | 0.134503 | 0.935673 | 0.923977 | 0.947368 | 0.912281 |
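utils.svm_classfier and utils.rf_classifier (the former's spelling follows the utils module) plausibly wrap sklearn with default hyperparameters; the sketches below are an assumption, included for completeness.

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

def svm_classfier_sketch(x_train, y_train, x_test):
    return SVC().fit(x_train, y_train).predict(x_test)

def rf_classifier_sketch(x_train, y_train, x_test):
    return RandomForestClassifier(random_state=42).fit(x_train, y_train).predict(x_test)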