相关分析

python
数据分析
R语言
Correlation Funnel 是一种快速有效的 EDA 工具。它通过可视化的方式,帮助用户轻松地发现数据中的相关性,并从中提取有价值的洞察。
作者

不止BI

发布于

2024年4月1日

在数据科学项目中,探索性数据分析 (EDA) 是一个至关重要的阶段。它旨在了解数据的基本特征,发现潜在的模式和趋势,为后续的建模和分析奠定基础。然而,传统的 EDA 方法通常需要大量的时间和精力,尤其是当面对复杂的大型数据集时。

Correlation Funnel 是一种快速有效的 EDA 工具。它通过可视化的方式,帮助用户轻松地发现数据中的相关性,并从中提取有价值的洞察。

加载数据集

代码
import pandas as pd 
import pytimetk as tk

# Load the penguins sample data from the seaborn-data repository.
PENGUINS_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
df_penguins = pd.read_csv(PENGUINS_CSV_URL)

# Preview the first five rows.
df_penguins.head(5)
  species     island  bill_length_mm  ...  flipper_length_mm  body_mass_g     sex
0  Adelie  Torgersen            39.1  ...              181.0       3750.0    MALE
1  Adelie  Torgersen            39.5  ...              186.0       3800.0  FEMALE
2  Adelie  Torgersen            40.3  ...              195.0       3250.0  FEMALE
3  Adelie  Torgersen             NaN  ...                NaN          NaN     NaN
4  Adelie  Torgersen            36.7  ...              193.0       3450.0  FEMALE

[5 rows x 7 columns]
代码
library(tidyverse)
# Read the penguins sample data from the seaborn-data repository.
df_penguins <- read_csv(
  "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
)

# Preview the first five rows.
df_penguins %>% head(5)
# A tibble: 5 × 7
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <chr>   <chr>              <dbl>         <dbl>             <dbl>       <dbl>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           NA            NA                  NA          NA
5 Adelie  Torgersen           36.7          19.3               193        3450
# ℹ 1 more variable: sex <chr>

离散化数据

代码
# Remove zero-variance columns, then drop every row that has a missing value.
df_penguins_clean = (
    df_penguins
    .drop_zero_variance()
    .dropna()
)

# Print a compact per-column summary of the cleaned frame.
df_penguins_clean.glimpse()
<class 'pandas.core.frame.DataFrame'>: 333 rows of 7 columns
species:            object            ['Adelie', 'Adelie', 'Adelie', 'Ad ...
island:             object            ['Torgersen', 'Torgersen', 'Torger ...
bill_length_mm:     float64           [39.1, 39.5, 40.3, 36.7, 39.3, 38. ...
bill_depth_mm:      float64           [18.7, 17.4, 18.0, 19.3, 20.6, 17. ...
flipper_length_mm:  float64           [181.0, 186.0, 195.0, 193.0, 190.0 ...
body_mass_g:        float64           [3750.0, 3800.0, 3250.0, 3450.0, 3 ...
sex:                object            ['MALE', 'FEMALE', 'FEMALE', 'FEMA ...
代码
# Binarize the cleaned data into 0/1 indicator columns.
# Categorical levels whose share is below 5% are grouped into an "other" level.
df_penguins_binarize = df_penguins_clean.binarize(
    n_bins=4,
    thresh_infreq=0.05,
)

df_penguins_binarize.glimpse()
<class 'pandas.core.frame.DataFrame'>: 333 rows of 24 columns
bill_length_mm__32.1_39.5:       int32             [1, 1, 0, 1, 1, 1, 1, ...
bill_length_mm__39.5_44.5:       int32             [0, 0, 1, 0, 0, 0, 0, ...
bill_length_mm__44.5_48.6:       int32             [0, 0, 0, 0, 0, 0, 0, ...
bill_length_mm__48.6_59.6:       int32             [0, 0, 0, 0, 0, 0, 0, ...
bill_depth_mm__13.1_15.6:        int32             [0, 0, 0, 0, 0, 0, 0, ...
bill_depth_mm__15.6_17.3:        int32             [0, 0, 0, 0, 0, 0, 0, ...
bill_depth_mm__17.3_18.7:        int32             [1, 1, 1, 0, 0, 1, 0, ...
bill_depth_mm__18.7_21.5:        int32             [0, 0, 0, 1, 1, 0, 1, ...
flipper_length_mm__172.0_190.0:  int32             [1, 1, 0, 0, 1, 1, 0, ...
flipper_length_mm__190.0_197.0:  int32             [0, 0, 1, 1, 0, 0, 1, ...
flipper_length_mm__197.0_213.0:  int32             [0, 0, 0, 0, 0, 0, 0, ...
flipper_length_mm__213.0_231.0:  int32             [0, 0, 0, 0, 0, 0, 0, ...
body_mass_g__2700.0_3550.0:      int32             [0, 0, 1, 1, 0, 0, 0, ...
body_mass_g__3550.0_4050.0:      int32             [1, 1, 0, 0, 1, 1, 0, ...
body_mass_g__4050.0_4775.0:      int32             [0, 0, 0, 0, 0, 0, 1, ...
body_mass_g__4775.0_6300.0:      int32             [0, 0, 0, 0, 0, 0, 0, ...
species__Adelie:                 int32             [1, 1, 1, 1, 1, 1, 1, ...
species__Chinstrap:              int32             [0, 0, 0, 0, 0, 0, 0, ...
species__Gentoo:                 int32             [0, 0, 0, 0, 0, 0, 0, ...
island__Biscoe:                  int32             [0, 0, 0, 0, 0, 0, 0, ...
island__Dream:                   int32             [0, 0, 0, 0, 0, 0, 0, ...
island__Torgersen:               int32             [1, 1, 1, 1, 1, 1, 1, ...
sex__FEMALE:                     int32             [0, 1, 1, 1, 0, 1, 0, ...
sex__MALE:                       int32             [1, 0, 0, 0, 1, 0, 1, ...
代码
library(correlationfunnel)
# Drop every row that contains a missing value.
df_penguins_clean <- df_penguins %>%
  na.omit()

# Print a compact per-column summary of the cleaned data.
df_penguins_clean %>% glimpse()
Rows: 333
Columns: 7
$ species           <chr> "Adelie", "Adelie", "Adelie", "Adelie", "Adelie", "A…
$ island            <chr> "Torgersen", "Torgersen", "Torgersen", "Torgersen", …
$ bill_length_mm    <dbl> 39.1, 39.5, 40.3, 36.7, 39.3, 38.9, 39.2, 41.1, 38.6…
$ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, 19.3, 20.6, 17.8, 19.6, 17.6, 21.2…
$ flipper_length_mm <dbl> 181, 186, 195, 193, 190, 181, 195, 182, 191, 198, 18…
$ body_mass_g       <dbl> 3750, 3800, 3250, 3450, 3650, 3625, 4675, 3200, 3800…
$ sex               <chr> "MALE", "FEMALE", "FEMALE", "FEMALE", "MALE", "FEMAL…
代码
# Binarize the cleaned data into 0/1 indicator columns.
# Categorical levels whose share is below 5% are grouped into an "other" level.
df_penguins_binarize <- df_penguins_clean %>%
  binarize(n_bins = 4, thresh_infreq = 0.05)

df_penguins_binarize %>% glimpse()
Rows: 333
Columns: 24
$ species__Adelie               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ species__Chinstrap            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ species__Gentoo               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ island__Biscoe                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ island__Dream                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ island__Torgersen             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ `bill_length_mm__-Inf_39.5`   <dbl> 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1…
$ bill_length_mm__39.5_44.5     <dbl> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0…
$ bill_length_mm__44.5_48.6     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ bill_length_mm__48.6_Inf      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `bill_depth_mm__-Inf_15.6`    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ bill_depth_mm__15.6_17.3      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ bill_depth_mm__17.3_18.7      <dbl> 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1…
$ bill_depth_mm__18.7_Inf       <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0…
$ `flipper_length_mm__-Inf_190` <dbl> 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1…
$ flipper_length_mm__190_197    <dbl> 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0…
$ flipper_length_mm__197_213    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
$ flipper_length_mm__213_Inf    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `body_mass_g__-Inf_3550`      <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1…
$ body_mass_g__3550_4050        <dbl> 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0…
$ body_mass_g__4050_4775        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0…
$ body_mass_g__4775_Inf         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ sex__FEMALE                   <dbl> 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1…
$ sex__MALE                     <dbl> 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0…

计算相关性

代码
# Correlate every binarized column against the species__Adelie indicator.
target_column = "species__Adelie"
df_penguins_Adelie_cor = df_penguins_binarize.correlate(target_column)

df_penguins_Adelie_cor
              feature            bin  correlation
16            species         Adelie     1.000000
0      bill_length_mm      32.1_39.5     0.667798
18            species         Gentoo    -0.658904
8   flipper_length_mm    172.0_190.0     0.513977
3      bill_length_mm      48.6_59.6    -0.509125
15        body_mass_g  4775.0_6300.0    -0.509125
4       bill_depth_mm      13.1_15.6    -0.503415
11  flipper_length_mm    213.0_231.0    -0.500954
2      bill_length_mm      44.5_48.6    -0.467153
21             island      Torgersen     0.458786
17            species      Chinstrap    -0.447597
12        body_mass_g  2700.0_3550.0     0.363589
19             island         Biscoe    -0.332518
6       bill_depth_mm      17.3_18.7     0.329427
1      bill_length_mm      39.5_44.5     0.303089
10  flipper_length_mm    197.0_213.0    -0.261152
7       bill_depth_mm      18.7_21.5     0.232803
9   flipper_length_mm    190.0_197.0     0.226091
13        body_mass_g  3550.0_4050.0     0.170002
5       bill_depth_mm      15.6_17.3    -0.053355
14        body_mass_g  4050.0_4775.0    -0.031412
20             island          Dream     0.013443
22                sex         FEMALE     0.007961
23                sex           MALE    -0.007961
代码
# Correlate every binarized column against the species__Adelie indicator.
df_penguins_Adelie_cor <- df_penguins_binarize %>%
  correlate(target = species__Adelie)

df_penguins_Adelie_cor
# A tibble: 24 × 3
   feature           bin       correlation
   <fct>             <chr>           <dbl>
 1 species           Adelie          1    
 2 bill_length_mm    -Inf_39.5       0.668
 3 species           Gentoo         -0.659
 4 flipper_length_mm -Inf_190        0.514
 5 bill_length_mm    48.6_Inf       -0.509
 6 body_mass_g       4775_Inf       -0.509
 7 bill_depth_mm     -Inf_15.6      -0.503
 8 flipper_length_mm 213_Inf        -0.501
 9 bill_length_mm    44.5_48.6      -0.467
10 island            Torgersen       0.459
# ℹ 14 more rows

绘制相关性图

代码
# Draw the correlation funnel using the plotly engine.
df_penguins_Adelie_cor.plot_correlation_funnel(engine="plotly")
代码
# df_penguins_Adelie_cor.plot_correlation_funnel(
#     engine = 'plotnine'
# )
代码
# Draw the correlation funnel as an interactive plot.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
df_penguins_Adelie_cor %>%
  plot_correlation_funnel(interactive = TRUE)

从上图可知:

  • Adelie 企鹅与 bill_length_mm 处于 32.1_39.5 区间、flipper_length_mm 处于 172.0_190.0 区间的相关性最强(相关系数分别约为 0.67 和 0.51)

  • Adelie企鹅与Gentoo 企鹅差异较大

回到顶部