import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import *
%matplotlib inline
Let us make some elementary charts and plots to understand what the data is like.
df_PCOS_all = pd.read_excel('PCOS_data_without_infertility.xlsx', sheet_name = 'Full_new')
df_PCOS_all.head()
Sl. No | Patient File No. | PCOS (Y/N) | Age (yrs) | Weight (Kg) | Height(Cm) | BMI | Blood Group | Pulse rate(bpm) | RR (breaths/min) | ... | Fast food (Y/N) | Reg.Exercise(Y/N) | BP _Systolic (mmHg) | BP _Diastolic (mmHg) | Follicle No. (L) | Follicle No. (R) | Avg. F size (L) (mm) | Avg. F size (R) (mm) | Endometrium (mm) | Unnamed: 44 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 28 | 44.6 | 152.0 | 19.300000 | 15 | 78 | 22 | ... | 1.0 | 0 | 110 | 80 | 3 | 3 | 18.0 | 18.0 | 8.5 | NaN |
1 | 2 | 2 | 0 | 36 | 65.0 | 161.5 | 24.921163 | 15 | 74 | 20 | ... | 0.0 | 0 | 120 | 70 | 3 | 5 | 15.0 | 14.0 | 3.7 | NaN |
2 | 3 | 3 | 1 | 33 | 68.8 | 165.0 | 25.270891 | 11 | 72 | 18 | ... | 1.0 | 0 | 120 | 80 | 13 | 15 | 18.0 | 20.0 | 10.0 | NaN |
3 | 4 | 4 | 0 | 37 | 65.0 | 148.0 | 29.674945 | 13 | 72 | 20 | ... | 0.0 | 0 | 120 | 70 | 2 | 2 | 15.0 | 14.0 | 7.5 | NaN |
4 | 5 | 5 | 0 | 25 | 52.0 | 161.0 | 20.060954 | 11 | 72 | 18 | ... | 0.0 | 0 | 120 | 80 | 3 | 4 | 16.0 | 14.0 | 7.0 | NaN |
5 rows × 45 columns
df_PCOS_all.describe()
Sl. No | Patient File No. | PCOS (Y/N) | Age (yrs) | Weight (Kg) | Height(Cm) | BMI | Blood Group | Pulse rate(bpm) | RR (breaths/min) | ... | Pimples(Y/N) | Fast food (Y/N) | Reg.Exercise(Y/N) | BP _Systolic (mmHg) | BP _Diastolic (mmHg) | Follicle No. (L) | Follicle No. (R) | Avg. F size (L) (mm) | Avg. F size (R) (mm) | Endometrium (mm) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | ... | 541.000000 | 540.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 | 541.000000 |
mean | 271.000000 | 271.000000 | 0.327172 | 31.430684 | 59.637153 | 156.484835 | 24.311285 | 13.802218 | 73.247689 | 19.243993 | ... | 0.489834 | 0.514815 | 0.247689 | 114.661738 | 76.927911 | 6.129390 | 6.641405 | 15.018115 | 15.451701 | 8.475915 |
std | 156.317519 | 156.317519 | 0.469615 | 5.411006 | 11.028287 | 6.033545 | 4.056399 | 1.840812 | 4.430285 | 1.688629 | ... | 0.500359 | 0.500244 | 0.432070 | 7.384556 | 5.574112 | 4.229294 | 4.436889 | 3.566839 | 3.318848 | 2.165381 |
min | 1.000000 | 1.000000 | 0.000000 | 20.000000 | 31.000000 | 137.000000 | 12.417882 | 11.000000 | 13.000000 | 16.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 12.000000 | 8.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 136.000000 | 136.000000 | 0.000000 | 28.000000 | 52.000000 | 152.000000 | 21.641274 | 13.000000 | 72.000000 | 18.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 110.000000 | 70.000000 | 3.000000 | 3.000000 | 13.000000 | 13.000000 | 7.000000 |
50% | 271.000000 | 271.000000 | 0.000000 | 31.000000 | 59.000000 | 156.000000 | 24.238227 | 14.000000 | 72.000000 | 18.000000 | ... | 0.000000 | 1.000000 | 0.000000 | 110.000000 | 80.000000 | 5.000000 | 6.000000 | 15.000000 | 16.000000 | 8.500000 |
75% | 406.000000 | 406.000000 | 1.000000 | 35.000000 | 65.000000 | 160.000000 | 26.634958 | 15.000000 | 74.000000 | 20.000000 | ... | 1.000000 | 1.000000 | 0.000000 | 120.000000 | 80.000000 | 9.000000 | 10.000000 | 18.000000 | 18.000000 | 9.800000 |
max | 541.000000 | 541.000000 | 1.000000 | 48.000000 | 108.000000 | 180.000000 | 38.900000 | 18.000000 | 82.000000 | 28.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 140.000000 | 100.000000 | 22.000000 | 20.000000 | 24.000000 | 24.000000 | 18.000000 |
8 rows × 42 columns
df_PCOS_all.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 45 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Sl. No 541 non-null int64
1 Patient File No. 541 non-null int64
2 PCOS (Y/N) 541 non-null int64
3 Age (yrs) 541 non-null int64
4 Weight (Kg) 541 non-null float64
5 Height(Cm) 541 non-null float64
6 BMI 541 non-null float64
7 Blood Group 541 non-null int64
8 Pulse rate(bpm) 541 non-null int64
9 RR (breaths/min) 541 non-null int64
10 Hb(g/dl) 541 non-null float64
11 Cycle(R/I) 541 non-null int64
12 Cycle length(days) 541 non-null int64
13 Marraige Status (Yrs) 540 non-null float64
14 Pregnant(Y/N) 541 non-null int64
15 No. of aborptions 541 non-null int64
16 I beta-HCG(mIU/mL) 541 non-null float64
17 II beta-HCG(mIU/mL) 541 non-null object
18 FSH(mIU/mL) 541 non-null float64
19 LH(mIU/mL) 541 non-null float64
20 FSH/LH 541 non-null float64
21 Hip(inch) 541 non-null int64
22 Waist(inch) 541 non-null int64
23 Waist:Hip Ratio 541 non-null float64
24 TSH (mIU/L) 541 non-null float64
25 AMH(ng/mL) 541 non-null object
26 PRL(ng/mL) 541 non-null float64
27 Vit D3 (ng/mL) 541 non-null float64
28 PRG(ng/mL) 541 non-null float64
29 RBS(mg/dl) 541 non-null float64
30 Weight gain(Y/N) 541 non-null int64
31 hair growth(Y/N) 541 non-null int64
32 Skin darkening (Y/N) 541 non-null int64
33 Hair loss(Y/N) 541 non-null int64
34 Pimples(Y/N) 541 non-null int64
35 Fast food (Y/N) 540 non-null float64
36 Reg.Exercise(Y/N) 541 non-null int64
37 BP _Systolic (mmHg) 541 non-null int64
38 BP _Diastolic (mmHg) 541 non-null int64
39 Follicle No. (L) 541 non-null int64
40 Follicle No. (R) 541 non-null int64
41 Avg. F size (L) (mm) 541 non-null float64
42 Avg. F size (R) (mm) 541 non-null float64
43 Endometrium (mm) 541 non-null float64
44 Unnamed: 44 2 non-null object
dtypes: float64(19), int64(23), object(3)
memory usage: 190.3+ KB
print(df_PCOS_all["Marraige Status (Yrs)"].isnull().sum())
index = df_PCOS_all.index[df_PCOS_all['AMH(ng/mL)'] == "a"]
df_PCOS_all = df_PCOS_all.drop(index)
Let’s find all the missing values
Let us convert the Y/N columns to boolean type.
df_PCOS_all["Cycle(R/I)"] = np.where(df_PCOS_all["Cycle(R/I)"] > 2, True, False)
columns_to_convert = df_PCOS_all.filter(like = 'Y/N').columns
df_PCOS_all[columns_to_convert] = df_PCOS_all[columns_to_convert].astype('bool')
Let us check if it has worked.
df_PCOS_all.info()
<class 'pandas.core.frame.DataFrame'>
Index: 540 entries, 0 to 540
Data columns (total 45 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Sl. No 540 non-null int64
1 Patient File No. 540 non-null int64
2 PCOS (Y/N) 540 non-null bool
3 Age (yrs) 540 non-null int64
4 Weight (Kg) 540 non-null float64
5 Height(Cm) 540 non-null float64
6 BMI 540 non-null float64
7 Blood Group 540 non-null int64
8 Pulse rate(bpm) 540 non-null int64
9 RR (breaths/min) 540 non-null int64
10 Hb(g/dl) 540 non-null float64
11 Cycle(R/I) 540 non-null bool
12 Cycle length(days) 540 non-null int64
13 Marraige Status (Yrs) 539 non-null float64
14 Pregnant(Y/N) 540 non-null bool
15 No. of aborptions 540 non-null int64
16 I beta-HCG(mIU/mL) 540 non-null float64
17 II beta-HCG(mIU/mL) 540 non-null object
18 FSH(mIU/mL) 540 non-null float64
19 LH(mIU/mL) 540 non-null float64
20 FSH/LH 540 non-null float64
21 Hip(inch) 540 non-null int64
22 Waist(inch) 540 non-null int64
23 Waist:Hip Ratio 540 non-null float64
24 TSH (mIU/L) 540 non-null float64
25 AMH(ng/mL) 540 non-null object
26 PRL(ng/mL) 540 non-null float64
27 Vit D3 (ng/mL) 540 non-null float64
28 PRG(ng/mL) 540 non-null float64
29 RBS(mg/dl) 540 non-null float64
30 Weight gain(Y/N) 540 non-null bool
31 hair growth(Y/N) 540 non-null bool
32 Skin darkening (Y/N) 540 non-null bool
33 Hair loss(Y/N) 540 non-null bool
34 Pimples(Y/N) 540 non-null bool
35 Fast food (Y/N) 540 non-null bool
36 Reg.Exercise(Y/N) 540 non-null bool
37 BP _Systolic (mmHg) 540 non-null int64
38 BP _Diastolic (mmHg) 540 non-null int64
39 Follicle No. (L) 540 non-null int64
40 Follicle No. (R) 540 non-null int64
41 Avg. F size (L) (mm) 540 non-null float64
42 Avg. F size (R) (mm) 540 non-null float64
43 Endometrium (mm) 540 non-null float64
44 Unnamed: 44 2 non-null object
dtypes: bool(10), float64(18), int64(14), object(3)
memory usage: 173.3+ KB
Apparently there is one numeric value mis-typed as 1.99. rather than 1.99 which makes it difficult for the column to be converted to float, let’s correct that.
print(df_PCOS_all['II beta-HCG(mIU/mL)'] == "1.99.")
print(df_PCOS_all.loc[123, 'II beta-HCG(mIU/mL)'])
df_PCOS_all.loc[123, 'II beta-HCG(mIU/mL)'] = '1.99'
#df_PCOS_all = df_PCOS_all.drop(302)
print(df_PCOS_all.loc[123, 'II beta-HCG(mIU/mL)'])
0 False
1 False
2 False
3 False
4 False
...
536 False
537 False
538 False
539 False
540 False
Name: II beta-HCG(mIU/mL), Length: 540, dtype: bool
1.99.
1.99
Make some plots to understand how the data looks like
columns_to_drop = ['Unnamed: 44', 'Sl. No', 'Patient File No.', 'Blood Group']
#columns_to_drop = ['Sl. No', 'Patient File No.', 'Blood Group']
df_temp = df_PCOS_all.select_dtypes(exclude = 'bool')
df_temp = df_temp.drop(columns_to_drop, axis = 1)
#df_temp = df_temp.drop[df_temp.index[df_temp['AMH(ng/mL)'] == "a"]]
df_temp[df_temp.columns] = df_temp[df_temp.columns].astype('float64')
variables = Dropdown(options = list(df_temp.columns), description = 'Variables')
output = Output()
def make_plot(*args):
output.clear_output(wait=True)
with output:
sns.histplot(x = variables.value, data= df_temp, kde = True)
plt.show()
variables.observe(make_plot, names = 'value')
display(variables, output)
Dropdown(description='Variables', options=(' Age (yrs)', 'Weight (Kg)', 'Height(Cm) ', 'BMI', 'Pulse rate(bpm)…
Output()
Let’s understand the correlations
sns.clustermap(df_temp.corr(), cmap="rocket_r")
<seaborn.matrix.ClusterGrid at 0x13bf76a50>
df_temp.corr().style.background_gradient(cmap='coolwarm')
Age (yrs) | Weight (Kg) | Height(Cm) | BMI | Pulse rate(bpm) | RR (breaths/min) | Hb(g/dl) | Cycle(R/I) | Cycle length(days) | Marraige Status (Yrs) | No. of aborptions | I beta-HCG(mIU/mL) | II beta-HCG(mIU/mL) | FSH(mIU/mL) | LH(mIU/mL) | FSH/LH | Hip(inch) | Waist(inch) | Waist:Hip Ratio | TSH (mIU/L) | AMH(ng/mL) | PRL(ng/mL) | Vit D3 (ng/mL) | PRG(ng/mL) | RBS(mg/dl) | BP _Systolic (mmHg) | BP _Diastolic (mmHg) | Follicle No. (L) | Follicle No. (R) | Avg. F size (L) (mm) | Avg. F size (R) (mm) | Endometrium (mm) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Age (yrs) | 1.000000 | -0.029136 | -0.118577 | 0.021316 | 0.045553 | 0.086628 | -0.022778 | -0.084871 | 0.055898 | 0.662254 | 0.221843 | 0.008511 | 0.043378 | -0.017708 | 0.000603 | 0.012432 | -0.001346 | 0.036263 | 0.066452 | 0.010245 | -0.179648 | -0.046077 | 0.004454 | -0.021775 | 0.097086 | 0.072313 | 0.069329 | -0.109965 | -0.158865 | -0.017435 | -0.079646 | -0.101969 |
Weight (Kg) | -0.029136 | 1.000000 | 0.419901 | 0.901755 | 0.020089 | 0.043901 | 0.009979 | 0.200470 | -0.002284 | 0.043991 | 0.093310 | 0.015882 | -0.000521 | -0.025785 | -0.029910 | -0.004831 | 0.633920 | 0.639589 | 0.014976 | 0.071410 | 0.031050 | -0.050016 | 0.008145 | 0.069688 | 0.114294 | 0.028066 | 0.130843 | 0.173501 | 0.124092 | -0.021036 | -0.073115 | -0.010932 |
Height(Cm) | -0.118577 | 0.419901 | 1.000000 | -0.006906 | -0.074144 | -0.028862 | 0.025259 | -0.018208 | 0.009595 | -0.066407 | -0.026240 | 0.062079 | 0.036474 | 0.030883 | -0.045619 | 0.022065 | 0.215357 | 0.209348 | -0.008982 | 0.018501 | -0.045208 | -0.018178 | -0.034997 | 0.049654 | 0.050437 | -0.067029 | 0.009420 | 0.105574 | 0.074896 | -0.025959 | 0.059686 | -0.055987 |
BMI | 0.021316 | 0.901755 | -0.006906 | 1.000000 | 0.050536 | 0.061932 | 0.003534 | 0.232891 | -0.006231 | 0.084015 | 0.109865 | -0.009967 | -0.015270 | -0.040717 | -0.013312 | -0.012076 | 0.597058 | 0.607524 | 0.023602 | 0.072305 | 0.054023 | -0.047721 | 0.027035 | 0.049460 | 0.093543 | 0.069549 | 0.140134 | 0.142903 | 0.104205 | -0.011594 | -0.111519 | 0.009320 |
Pulse rate(bpm) | 0.045553 | 0.020089 | -0.074144 | 0.050536 | 1.000000 | 0.303727 | -0.052265 | 0.101241 | 0.006411 | 0.038701 | 0.046227 | -0.020437 | -0.016226 | -0.013071 | -0.032315 | -0.013104 | 0.062951 | 0.037880 | -0.052890 | -0.051482 | -0.049843 | 0.020750 | -0.001486 | -0.017678 | 0.042002 | -0.025751 | 0.008027 | 0.040559 | 0.049307 | -0.048546 | -0.034256 | -0.040891 |
RR (breaths/min) | 0.086628 | 0.043901 | -0.028862 | 0.061932 | 0.303727 | 1.000000 | -0.041027 | 0.018850 | 0.004972 | 0.077701 | -0.006090 | -0.085028 | -0.039017 | -0.032388 | -0.031211 | -0.043339 | 0.075020 | 0.038310 | -0.075567 | -0.011968 | -0.017480 | 0.007284 | -0.009016 | -0.076895 | 0.050811 | 0.016734 | 0.053751 | 0.070179 | 0.012752 | -0.031527 | -0.022035 | -0.062941 |
Hb(g/dl) | -0.022778 | 0.009979 | 0.025259 | 0.003534 | -0.052265 | -0.041027 | 1.000000 | 0.037428 | -0.051992 | 0.006743 | 0.060702 | -0.016649 | -0.094425 | -0.047398 | -0.089107 | -0.039827 | -0.024712 | -0.000779 | 0.057161 | -0.026156 | 0.056172 | -0.063091 | 0.063916 | 0.065769 | 0.024067 | 0.052229 | 0.002046 | 0.061814 | 0.073422 | 0.031995 | 0.024153 | -0.065041 |
Cycle(R/I) | -0.084871 | 0.200470 | -0.018208 | 0.232891 | 0.101241 | 0.018850 | 0.037428 | 1.000000 | -0.201044 | -0.033557 | -0.057937 | 0.063098 | 0.027930 | -0.026083 | -0.021393 | -0.016167 | 0.174305 | 0.168856 | -0.004003 | -0.017435 | 0.194063 | 0.005251 | 0.096717 | -0.033781 | -0.006394 | 0.055790 | 0.080057 | 0.296114 | 0.251271 | 0.034112 | 0.016204 | 0.042168 |
Cycle length(days) | 0.055898 | -0.002284 | 0.009595 | -0.006231 | 0.006411 | 0.004972 | -0.051992 | -0.201044 | 1.000000 | 0.117850 | 0.004023 | 0.020187 | 0.018577 | 0.029645 | -0.001686 | 0.025937 | 0.040354 | -0.023479 | -0.130093 | -0.003191 | -0.072822 | 0.016415 | -0.058345 | 0.007190 | 0.000210 | -0.011963 | -0.075792 | -0.086809 | -0.161256 | -0.052364 | -0.013956 | -0.016506 |
Marraige Status (Yrs) | 0.662254 | 0.043991 | -0.066407 | 0.084015 | 0.038701 | 0.077701 | 0.006743 | -0.033557 | 0.117850 | 1.000000 | 0.246713 | 0.111666 | 0.112905 | -0.023461 | 0.035509 | -0.002983 | 0.038325 | 0.057376 | 0.033334 | -0.040241 | -0.146482 | -0.025819 | 0.041541 | -0.052840 | 0.075519 | 0.028478 | 0.005745 | -0.079079 | -0.087227 | -0.072229 | -0.097536 | -0.105882 |
No. of aborptions | 0.221843 | 0.093310 | -0.026240 | 0.109865 | 0.046227 | -0.006090 | 0.060702 | -0.057937 | 0.004023 | 0.246713 | 1.000000 | 0.057762 | 0.046977 | -0.018187 | -0.018876 | -0.026472 | 0.078418 | 0.073280 | -0.004031 | 0.032359 | -0.052920 | -0.064655 | 0.015516 | -0.023962 | -0.013134 | -0.083221 | 0.070745 | -0.058061 | -0.078688 | -0.056589 | -0.117573 | -0.067758 |
I beta-HCG(mIU/mL) | 0.008511 | 0.015882 | 0.062079 | -0.009967 | -0.020437 | -0.085028 | -0.016649 | 0.063098 | 0.020187 | 0.111666 | 0.057762 | 1.000000 | 0.533687 | 0.001641 | -0.006914 | 0.007892 | 0.014833 | 0.005173 | -0.021490 | -0.044710 | 0.014428 | -0.013383 | -0.012659 | -0.002984 | -0.023509 | -0.081720 | 0.003923 | 0.048325 | 0.018266 | 0.050100 | 0.071863 | -0.051920 |
II beta-HCG(mIU/mL) | 0.043378 | -0.000521 | 0.036474 | -0.015270 | -0.016226 | -0.039017 | -0.094425 | 0.027930 | 0.018577 | 0.112905 | 0.046977 | 0.533687 | 1.000000 | 0.016752 | -0.008092 | 0.027150 | 0.015716 | 0.016541 | 0.001982 | -0.036839 | 0.003668 | -0.001337 | -0.007552 | 0.005649 | -0.004830 | -0.059202 | 0.004021 | 0.064106 | 0.037524 | 0.000003 | 0.037088 | 0.017013 |
FSH(mIU/mL) | -0.017708 | -0.025785 | 0.030883 | -0.040717 | -0.013071 | -0.032388 | -0.047398 | -0.026083 | 0.029645 | -0.023461 | -0.018187 | 0.001641 | 0.016752 | 1.000000 | -0.001450 | 0.971951 | -0.000158 | 0.013754 | 0.027308 | -0.024805 | -0.014568 | 0.017848 | -0.000683 | -0.003426 | 0.018798 | -0.026830 | 0.023285 | -0.002342 | -0.025358 | 0.011547 | 0.020184 | -0.049158 |
LH(mIU/mL) | 0.000603 | -0.029910 | -0.045619 | -0.013312 | -0.032315 | -0.031211 | -0.089107 | -0.021393 | -0.001686 | 0.035509 | -0.018876 | -0.006914 | -0.008092 | -0.001450 | 1.000000 | -0.005724 | -0.021727 | -0.022304 | -0.002980 | -0.009398 | 0.021055 | 0.046904 | -0.001543 | -0.004080 | -0.015216 | -0.027699 | 0.023964 | -0.001280 | 0.003430 | 0.035632 | 0.032495 | 0.010786 |
FSH/LH | 0.012432 | -0.004831 | 0.022065 | -0.012076 | -0.013104 | -0.043339 | -0.039827 | -0.016167 | 0.025937 | -0.002983 | -0.026472 | 0.007892 | 0.027150 | 0.971951 | -0.005724 | 1.000000 | 0.012370 | 0.027588 | 0.029240 | -0.028331 | -0.007438 | 0.015264 | -0.001750 | -0.005109 | 0.006231 | -0.019302 | 0.026920 | 0.005872 | -0.007506 | 0.014923 | 0.024070 | -0.053768 |
Hip(inch) | -0.001346 | 0.633920 | 0.215357 | 0.597058 | 0.062951 | 0.075020 | -0.024712 | 0.174305 | 0.040354 | 0.038325 | 0.078418 | 0.014833 | 0.015716 | -0.000158 | -0.021727 | 0.012370 | 1.000000 | 0.873589 | -0.241990 | 0.035835 | -0.066106 | -0.082153 | 0.009742 | 0.050008 | -0.001393 | -0.012106 | 0.104944 | 0.119505 | 0.094799 | -0.082276 | -0.104384 | 0.027778 |
Waist(inch) | 0.036263 | 0.639589 | 0.209348 | 0.607524 | 0.037880 | 0.038310 | -0.000779 | 0.168856 | -0.023479 | 0.057376 | 0.073280 | 0.005173 | 0.016541 | 0.013754 | -0.022304 | 0.027588 | 0.873589 | 1.000000 | 0.258365 | -0.008295 | -0.043132 | -0.036364 | 0.018989 | 0.029175 | 0.012995 | 0.034558 | 0.125243 | 0.129846 | 0.093401 | 0.031048 | -0.015271 | 0.012810 |
Waist:Hip Ratio | 0.066452 | 0.014976 | -0.008982 | 0.023602 | -0.052890 | -0.075567 | 0.057161 | -0.004003 | -0.130093 | 0.033334 | -0.004031 | -0.021490 | 0.001982 | 0.027308 | -0.002980 | 0.029240 | -0.241990 | 0.258365 | 1.000000 | -0.085023 | 0.057354 | 0.092356 | 0.017701 | -0.039042 | 0.023170 | 0.088978 | 0.042507 | 0.026456 | -0.002207 | 0.229355 | 0.176453 | -0.026793 |
TSH (mIU/L) | 0.010245 | 0.071410 | 0.018501 | 0.072305 | -0.051482 | -0.011968 | -0.026156 | -0.017435 | -0.003191 | -0.040241 | 0.032359 | -0.044710 | -0.036839 | -0.024805 | -0.009398 | -0.028331 | 0.035835 | -0.008295 | -0.085023 | 1.000000 | -0.010854 | 0.025606 | -0.009114 | -0.020504 | -0.030056 | 0.048215 | 0.055034 | -0.028164 | -0.016807 | -0.091201 | -0.088850 | 0.013863 |
AMH(ng/mL) | -0.179648 | 0.031050 | -0.045208 | 0.054023 | -0.049843 | -0.017480 | 0.056172 | 0.194063 | -0.072822 | -0.146482 | -0.052920 | 0.014428 | 0.003668 | -0.014568 | 0.021055 | -0.007438 | -0.066106 | -0.043132 | 0.057354 | -0.010854 | 1.000000 | -0.072214 | 0.007660 | -0.011195 | 0.008454 | -0.003192 | 0.026038 | 0.203407 | 0.188537 | 0.134628 | 0.096079 | 0.104159 |
PRL(ng/mL) | -0.046077 | -0.050016 | -0.018178 | -0.047721 | 0.020750 | 0.007284 | -0.063091 | 0.005251 | 0.016415 | -0.025819 | -0.064655 | -0.013383 | -0.001337 | 0.017848 | 0.046904 | 0.015264 | -0.082153 | -0.036364 | 0.092356 | 0.025606 | -0.072214 | 1.000000 | -0.006827 | -0.009700 | -0.041676 | -0.009963 | -0.026821 | -0.010803 | -0.008607 | 0.086311 | 0.071688 | 0.027439 |
Vit D3 (ng/mL) | 0.004454 | 0.008145 | -0.034997 | 0.027035 | -0.001486 | -0.009016 | 0.063916 | 0.096717 | -0.058345 | 0.041541 | 0.015516 | -0.012659 | -0.007552 | -0.000683 | -0.001543 | -0.001750 | 0.009742 | 0.018989 | 0.017701 | -0.009114 | 0.007660 | -0.006827 | 1.000000 | -0.007240 | 0.038980 | 0.042855 | -0.024962 | 0.074172 | 0.091775 | 0.005441 | -0.011812 | -0.031385 |
PRG(ng/mL) | -0.021775 | 0.069688 | 0.049654 | 0.049460 | -0.017678 | -0.076895 | 0.065769 | -0.033781 | 0.007190 | -0.052840 | -0.023962 | -0.002984 | 0.005649 | -0.003426 | -0.004080 | -0.005109 | 0.050008 | 0.029175 | -0.039042 | -0.020504 | -0.011195 | -0.009700 | -0.007240 | 1.000000 | -0.003545 | -0.020638 | 0.032131 | 0.018489 | 0.031483 | -0.040698 | -0.040910 | -0.047987 |
RBS(mg/dl) | 0.097086 | 0.114294 | 0.050437 | 0.093543 | 0.042002 | 0.050811 | 0.024067 | -0.006394 | 0.000210 | 0.075519 | -0.013134 | -0.023509 | -0.004830 | 0.018798 | -0.015216 | 0.006231 | -0.001393 | 0.012995 | 0.023170 | -0.030056 | 0.008454 | -0.041676 | 0.038980 | -0.003545 | 1.000000 | 0.052683 | -0.032907 | 0.044342 | 0.027562 | -0.046821 | 0.013137 | -0.018700 |
BP _Systolic (mmHg) | 0.072313 | 0.028066 | -0.067029 | 0.069549 | -0.025751 | 0.016734 | 0.052229 | 0.055790 | -0.011963 | 0.028478 | -0.083221 | -0.081720 | -0.059202 | -0.026830 | -0.027699 | -0.019302 | -0.012106 | 0.034558 | 0.088978 | 0.048215 | -0.003192 | -0.009963 | 0.042855 | -0.020638 | 0.052683 | 1.000000 | 0.102090 | 0.039463 | 0.025515 | 0.051273 | 0.038448 | -0.018613 |
BP _Diastolic (mmHg) | 0.069329 | 0.130843 | 0.009420 | 0.140134 | 0.008027 | 0.053751 | 0.002046 | 0.080057 | -0.075792 | 0.005745 | 0.070745 | 0.003923 | 0.004021 | 0.023285 | 0.023964 | 0.026920 | 0.104944 | 0.125243 | 0.042507 | 0.055034 | 0.026038 | -0.026821 | -0.024962 | 0.032131 | -0.032907 | 0.102090 | 1.000000 | 0.025043 | 0.037844 | 0.037028 | 0.024013 | -0.015342 |
Follicle No. (L) | -0.109965 | 0.173501 | 0.105574 | 0.142903 | 0.040559 | 0.070179 | 0.061814 | 0.296114 | -0.086809 | -0.079079 | -0.058061 | 0.048325 | 0.064106 | -0.002342 | -0.001280 | 0.005872 | 0.119505 | 0.129846 | 0.026456 | -0.028164 | 0.203407 | -0.010803 | 0.074172 | 0.018489 | 0.044342 | 0.039463 | 0.025043 | 1.000000 | 0.799516 | 0.249781 | 0.149002 | 0.078605 |
Follicle No. (R) | -0.158865 | 0.124092 | 0.074896 | 0.104205 | 0.049307 | 0.012752 | 0.073422 | 0.251271 | -0.161256 | -0.087227 | -0.078688 | 0.018266 | 0.037524 | -0.025358 | 0.003430 | -0.007506 | 0.094799 | 0.093401 | -0.002207 | -0.016807 | 0.188537 | -0.008607 | 0.091775 | 0.031483 | 0.027562 | 0.025515 | 0.037844 | 0.799516 | 1.000000 | 0.156640 | 0.188049 | 0.079817 |
Avg. F size (L) (mm) | -0.017435 | -0.021036 | -0.025959 | -0.011594 | -0.048546 | -0.031527 | 0.031995 | 0.034112 | -0.052364 | -0.072229 | -0.056589 | 0.050100 | 0.000003 | 0.011547 | 0.035632 | 0.014923 | -0.082276 | 0.031048 | 0.229355 | -0.091201 | 0.134628 | 0.086311 | 0.005441 | -0.040698 | -0.046821 | 0.051273 | 0.037028 | 0.249781 | 0.156640 | 1.000000 | 0.522794 | -0.014174 |
Avg. F size (R) (mm) | -0.079646 | -0.073115 | 0.059686 | -0.111519 | -0.034256 | -0.022035 | 0.024153 | 0.016204 | -0.013956 | -0.097536 | -0.117573 | 0.071863 | 0.037088 | 0.020184 | 0.032495 | 0.024070 | -0.104384 | -0.015271 | 0.176453 | -0.088850 | 0.096079 | 0.071688 | -0.011812 | -0.040910 | 0.013137 | 0.038448 | 0.024013 | 0.149002 | 0.188049 | 0.522794 | 1.000000 | -0.045628 |
Endometrium (mm) | -0.101969 | -0.010932 | -0.055987 | 0.009320 | -0.040891 | -0.062941 | -0.065041 | 0.042168 | -0.016506 | -0.105882 | -0.067758 | -0.051920 | 0.017013 | -0.049158 | 0.010786 | -0.053768 | 0.027778 | 0.012810 | -0.026793 | 0.013863 | 0.104159 | 0.027439 | -0.031385 | -0.047987 | -0.018700 | -0.018613 | -0.015342 | 0.078605 | 0.079817 | -0.014174 | -0.045628 | 1.000000 |
So far we did nott split the data on PCOS diagnosis. Our next task is to do that. Let’s make to different dataframes:
This should not be strictly necessary, but I am just playing around pandas here.
df_PCOS = df_PCOS_all[df_PCOS_all['PCOS (Y/N)']]
df_no_PCOS = df_PCOS_all[~df_PCOS_all['PCOS (Y/N)']]
df_no_PCOS
Sl. No | Patient File No. | PCOS (Y/N) | Age (yrs) | Weight (Kg) | Height(Cm) | BMI | Blood Group | Pulse rate(bpm) | RR (breaths/min) | ... | Fast food (Y/N) | Reg.Exercise(Y/N) | BP _Systolic (mmHg) | BP _Diastolic (mmHg) | Follicle No. (L) | Follicle No. (R) | Avg. F size (L) (mm) | Avg. F size (R) (mm) | Endometrium (mm) | Unnamed: 44 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | False | 28 | 44.6 | 152.000 | 19.300000 | 15 | 78 | 22 | ... | True | False | 110 | 80 | 3 | 3 | 18.0 | 18.0 | 8.5 | NaN |
1 | 2 | 2 | False | 36 | 65.0 | 161.500 | 24.921163 | 15 | 74 | 20 | ... | False | False | 120 | 70 | 3 | 5 | 15.0 | 14.0 | 3.7 | NaN |
3 | 4 | 4 | False | 37 | 65.0 | 148.000 | 29.674945 | 13 | 72 | 20 | ... | False | False | 120 | 70 | 2 | 2 | 15.0 | 14.0 | 7.5 | NaN |
4 | 5 | 5 | False | 25 | 52.0 | 161.000 | 20.060954 | 11 | 72 | 18 | ... | False | False | 120 | 80 | 3 | 4 | 16.0 | 14.0 | 7.0 | NaN |
5 | 6 | 6 | False | 36 | 74.1 | 165.000 | 27.217631 | 15 | 78 | 28 | ... | False | False | 110 | 70 | 9 | 6 | 16.0 | 20.0 | 8.0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
535 | 536 | 536 | False | 26 | 80.0 | 161.544 | 30.700000 | 18 | 70 | 18 | ... | False | False | 110 | 80 | 7 | 9 | 13.0 | 17.5 | 9.6 | NaN |
536 | 537 | 537 | False | 35 | 50.0 | 164.592 | 18.500000 | 17 | 72 | 16 | ... | False | False | 110 | 70 | 1 | 0 | 17.5 | 10.0 | 6.7 | NaN |
537 | 538 | 538 | False | 30 | 63.2 | 158.000 | 25.300000 | 15 | 72 | 18 | ... | False | False | 110 | 70 | 9 | 7 | 19.0 | 18.0 | 8.2 | NaN |
538 | 539 | 539 | False | 36 | 54.0 | 152.000 | 23.400000 | 13 | 74 | 20 | ... | False | False | 110 | 80 | 1 | 0 | 18.0 | 9.0 | 7.3 | NaN |
539 | 540 | 540 | False | 27 | 50.0 | 150.000 | 22.200000 | 15 | 74 | 20 | ... | False | False | 110 | 70 | 7 | 6 | 18.0 | 16.0 | 11.5 | NaN |
363 rows × 45 columns
# We have to use Perason's correlation coefficient because the dataframe contains both boolean and integer values.
corr = df_PCOS_all.drop(columns_to_drop, axis = 1).corr(method ='pearson')
# Let's try to extract the variables which have the highest and lowest correlation with PCOS status.
print(f' The three highest {list(corr.nlargest(3, "PCOS (Y/N)")["PCOS (Y/N)"].index)}')
print(f' The three smallest PCOS correlated variables are {list(corr.nsmallest(3, "PCOS (Y/N)")["PCOS (Y/N)"].index)}')
The three highest ['PCOS (Y/N)', 'Follicle No. (R)', 'Follicle No. (L)']
The three smallest PCOS correlated variables are ['Cycle length(days)', ' Age (yrs)', 'Marraige Status (Yrs)']
Do the correlation matrices of PCOS diagnosis vs no PCOS diagnosis look very different?
corrpos = df_PCOS.drop(columns_to_drop, axis = 1).corr()
corrneg = df_no_PCOS.drop(columns_to_drop, axis = 1).corr()
sns.heatmap(corrpos, cmap="Pastel1")
plt.title('Correlations in PCOS diagosed data');
sns.heatmap(corrneg, cmap="Pastel1")
plt.title('Correlations in no PCOS diagosed data');
sns.heatmap(corrpos-corrneg, cmap="Pastel1")
plt.title('Difference between PCOS diagnosed and no PCOS diagnose data');
For simplicity I would want to plot the 10 highest PCOS correlated and 5 lowest correlated variables as a heatmap.
#print(df_PCOS_all.describe())
#columns_to_drop = ['Sl. No', 'Patient File No.', 'Blood Group']
largest_vars = corr.nlargest(10, "PCOS (Y/N)")["PCOS (Y/N)"].index
smallest_vars = corr.nsmallest(5, "PCOS (Y/N)")["PCOS (Y/N)"].index
total_vars = largest_vars.append(smallest_vars)
sns.heatmap(df_PCOS_all.drop(columns_to_drop, axis=1)[total_vars].corr(), \
cmap = "Pastel1")
plt.title('10 largest and 5 smallest PCOS correlated quantities');
df_PCOS_numeric = df_PCOS.select_dtypes(exclude = 'bool')
df_PCOS_numeric = df_PCOS_numeric.drop(columns_to_drop, axis = 1)
df_no_PCOS_numeric = df_no_PCOS.select_dtypes(exclude = 'bool')
df_no_PCOS_numeric = df_no_PCOS_numeric.drop(columns_to_drop, axis = 1)
variable_dropdown = Dropdown(options = df_PCOS_numeric.columns)
output_plot = Output()
def make_histograms(*args):
output_plot.clear_output(wait = True)
with output_plot:
fig, ax1 = plt.subplots()
sns.histplot(x = variable_dropdown.value, data = df_PCOS, kde = True, ax = ax1 )
sns.histplot(x = variable_dropdown.value, data = df_no_PCOS, kde = True, ax = ax1)
#ax1.get_legend()
#legend = ax1.get_legend()
#handles = legend.legend_handles
#ax1.legend(handles, ['PCOS True', 'PCOS False'], title='Stat.ind.')
plt.show()
variable_dropdown.observe(make_histograms, names = 'value')
display(variable_dropdown, output_plot)
Dropdown(options=(' Age (yrs)', 'Weight (Kg)', 'Height(Cm) ', 'BMI', 'Pulse rate(bpm) ', 'RR (breaths/min)', '…
Output()
Among all the plots we can look at above the plots related to left and right follicle measurements are distinctly different. Let’s look at them in a bit more detail.
fig, ax = plt.subplots(2,2, figsize = (8,8))
sns.violinplot(df_PCOS_all, x = 'PCOS (Y/N)', y = 'Avg. F size (L) (mm)', ax = ax[0][0])
sns.violinplot(df_PCOS_all, x = 'PCOS (Y/N)', y = 'Avg. F size (R) (mm)', ax = ax[0][1])
sns.violinplot(df_PCOS_all, x = 'PCOS (Y/N)', y = 'Follicle No. (L)', ax = ax[1][0])
sns.violinplot(df_PCOS_all, x = 'PCOS (Y/N)', y = 'Follicle No. (R)', ax = ax[1][1])
<Axes: xlabel='PCOS (Y/N)', ylabel='Follicle No. (R)'>
My main goal is to be able to understand which factors are correlated with PCOS. These are not ‘causes’ of PCOS but rather symtoms which can help diagnose the condition. Note that correlations within binary data are also computed in .corr() method of pandas. Here I evaluate them explicitly.
Cramér’s V coefficient is used for evaluating correlation between two variables. The variables may have more than binary categories. For variables A, B if
$n_{i,j}$ = number of times $A_i, B_j$ were observed,
$\chi^2 = \sum_{i,j}\frac{(n_{i,j} - {\frac{n_i, n_j}{n}})}{\frac{n_i, n_j}{n}}$
where $n_i = \sum_j n_{i,j}$ is the number of times the value $A_i$ is observed and $n_i = \sum_j n_{i,j}$ is the number of times the value $B_j$ is observed.
The coefficient is the computed as
$V = \sqrt\frac{\chi^2/n}{min(k-1)(r-1)}$
where $k$ = number of columns, $r$ = number of rows, $n$ = total number of observations
from scipy.stats import chi2_contingency
from tabulate import tabulate
import numpy as np
df_PCOS_bool = df_PCOS_all.select_dtypes(include = 'bool')
results = []
for col in df_PCOS_bool.columns:
if col == 'PCOS (Y/N)': continue
pearson_corr = df_PCOS_bool['PCOS (Y/N)'].astype(int).corr(df_PCOS_bool[col].astype(int))
# Now let's compute the Cramér's V coefficient
contigency_table = pd.crosstab(df_PCOS_bool['PCOS (Y/N)'].astype(int), df_PCOS_bool[col].astype(int))
print('----------------------------------')
print(contigency_table )
chi2, _, _, _ = chi2_contingency(contigency_table)
n = contigency_table.values.sum()
cramers_v = np.sqrt(chi2/n)
results.append([col, pearson_corr, cramers_v])
print('----------------------------------')
print(tabulate(results, headers = ['criteria', 'Pearsons coefficient', 'cramers_v']))
----------------------------------
Pregnant(Y/N) 0 1
PCOS (Y/N)
0 221 142
1 113 64
----------------------------------
Weight gain(Y/N) 0 1
PCOS (Y/N)
0 280 83
1 56 121
----------------------------------
hair growth(Y/N) 0 1
PCOS (Y/N)
0 316 47
1 76 101
----------------------------------
Skin darkening (Y/N) 0 1
PCOS (Y/N)
0 307 56
1 67 110
----------------------------------
Hair loss(Y/N) 0 1
PCOS (Y/N)
0 220 143
1 75 102
----------------------------------
Pimples(Y/N) 0 1
PCOS (Y/N)
0 222 141
1 54 123
----------------------------------
Fast food (Y/N) 0 1
PCOS (Y/N)
0 223 140
1 38 139
----------------------------------
Reg.Exercise(Y/N) 0 1
PCOS (Y/N)
0 281 82
1 126 51
----------------------------------
criteria Pearsons coefficient cramers_v
-------------------- ---------------------- -----------
Pregnant(Y/N) -0.0286064 0.0245456
Weight gain(Y/N) 0.440488 0.436419
hair growth(Y/N) 0.464245 0.459823
Skin darkening (Y/N) 0.475283 0.471008
Hair loss(Y/N) 0.171913 0.167951
Pimples(Y/N) 0.287802 0.283856
Fast food (Y/N) 0.375389 0.371442
Reg.Exercise(Y/N) 0.0678092 0.0632309
It seems that hair growth, skin darknin and hair loss are correlated with PCOS diagnosis. Fast food seems to have a correlation but I wonder if that is an artefact.
I will use a random forrest classifier here because there is no specific knowledge to choose other classifiers. In addition, the data consists of both integers and boolaen types. I believe that random forest classifier should work.
$precision = \frac{TP}{TP+FP}$
$recall = \frac{TP}{FP+FN}$
$F_1 score = \frac{precision \times recall}{precision + recall}$
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve
from sklearn.preprocessing import StandardScaler
columns_to_convert = df_PCOS_all.filter(like = 'Y/N').columns
df_PCOS_all[columns_to_convert] = df_PCOS_all[columns_to_convert].astype('int')
label_col = "PCOS (Y/N)"
X = df_PCOS_all.drop(columns = [label_col])
X = X.drop(columns = columns_to_drop, axis = 1)
y = df_PCOS_all[label_col]
float_columns = X.select_dtypes(include = ['float64']).columns
scaler=StandardScaler()
X[float_columns] = scaler.fit_transform(X[float_columns])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy is {accuracy}')
print('\n Classification report is')
print(classification_report(y_test, y_pred))
# Optional: Display the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\n Confusion Matrix:")
print('\n'.join(['\t'.join([str(cell) for cell in row]) for row in conf_matrix]))
Accuracy is 0.8888888888888888
Classification report is
precision recall f1-score support
0 0.87 0.97 0.92 71
1 0.93 0.73 0.82 37
accuracy 0.89 108
macro avg 0.90 0.85 0.87 108
weighted avg 0.89 0.89 0.89 108
Confusion Matrix:
69 2
10 27
The above scores don’t look too bad.
Let us try to make a precisio vs recall curve. The following curve doesn’t look right, so I have to do something to fix it.
y_scores = cross_val_predict(clf, X_train, y_train)#, method = 'decision_function')
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)
print(precisions,recalls, thresholds)
plt.plot(thresholds, precisions[:-1])
plt.plot(thresholds, recalls[:-1])
plt.scatter(thresholds, precisions[:-1])
plt.scatter(thresholds, recalls[:-1])
[0.3287037 0.89430894 1. ] [1. 0.77464789 0. ] [0 1]
<matplotlib.collections.PathCollection at 0x15f8de990>
see https://towardsdatascience.com/hypothesis-testing-with-python-step-by-step-hands-on-tutorial-with-practical-examples-e805975ea96e for hypothesis testing