import pandas as pd
import numpy as np

data = pd.read_csv('./loan_data.csv')

# 80/20 train/test split (pass random_state=... to data.sample for a reproducible split)
train_data = data.sample(frac=0.8)
test_data = data.drop(train_data.index)

discrete_features = ['person_home_ownership',
                     'person_gender',
                     'person_education',
                     'loan_intent',
                     'previous_loan_defaults_on_file',
                     'loan_status']
continuous_features = ['person_age',
                       'person_income',
                       'person_emp_exp',
                       'loan_amnt',
                       'loan_int_rate',
                       'loan_percent_income',
                       'cb_person_cred_hist_length',
                       'credit_score']
train_data[discrete_features].head(5)
       person_home_ownership person_gender person_education        loan_intent previous_loan_defaults_on_file  loan_status
17004                    OWN          male         Bachelor          EDUCATION                            Yes            0
23005                   RENT          male      High School          EDUCATION                             No            1
42571                   RENT        female      High School            MEDICAL                             No            1
33151               MORTGAGE          male         Bachelor  DEBTCONSOLIDATION                            Yes            0
25229               MORTGAGE        female        Associate  DEBTCONSOLIDATION                             No            0
train_data[continuous_features].head(5)
       person_age  person_income  person_emp_exp  loan_amnt  loan_int_rate  loan_percent_income  cb_person_cred_hist_length  credit_score
17004        22.0        66454.0               0     8000.0          11.01                 0.12                          2.0           679
23005        30.0        43541.0              10     8000.0          14.59                 0.18                          9.0           644
42571        35.0        69434.0              12     6173.0          11.05                 0.09                         10.0           476
33151        30.0        83584.0               4    20000.0           9.41                 0.24                          6.0           665
25229        33.0        98797.0              10     9000.0           8.00                 0.09                          8.0           708
# Split the training data by class and estimate the per-class distributions:
# relative frequencies for the discrete features, (mean, std) for the continuous ones.
approved_df = train_data[train_data['loan_status'] == 1]
rejected_df = train_data[train_data['loan_status'] == 0]
approved_discrete_features = {feature: approved_df[feature].value_counts(normalize=True)
                              for feature in discrete_features}
rejected_discrete_features = {feature: rejected_df[feature].value_counts(normalize=True)
                              for feature in discrete_features}
approved_continuous_features = {feature: (approved_df[feature].mean(), approved_df[feature].std())
                                for feature in continuous_features}
rejected_continuous_features = {feature: (rejected_df[feature].mean(), rejected_df[feature].std())
                                for feature in continuous_features}

# Gaussian probability density used for the continuous features
normal_distribution = lambda x, mean, std: 1 / (std * np.sqrt(2 * np.pi)) * np.exp(-0.5 * (x - mean) ** 2 / std ** 2)
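As a quick sanity check on the hand-rolled density (a minimal sketch, assuming SciPy is available; it is not used anywhere else in this notebook), the lambda can be compared against scipy.stats.norm.pdf:

from scipy.stats import norm
# The hand-rolled Gaussian density should agree with SciPy's implementation.
x, mu, sigma = 650.0, 640.0, 50.0   # arbitrary illustrative values
assert np.isclose(normal_distribution(x, mu, sigma), norm.pdf(x, loc=mu, scale=sigma))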
# Classify each test sample by comparing per-class scores: the sum of the
# log-likelihoods of every feature plus the log prior of the class.
for idx, sample_data in test_data.iterrows():
    likelihood_approved = 0
    likelihood_rejected = 0
    # Discrete features: look up the relative frequency of the observed category.
    for feature in discrete_features:
        if feature != 'loan_status':
            try:
                likelihood_approved += np.log2(approved_discrete_features[feature][sample_data[feature]])
            except KeyError:
                # Category never seen in the approved class: use a very small log-probability.
                likelihood_approved += -300
            try:
                likelihood_rejected += np.log2(rejected_discrete_features[feature][sample_data[feature]])
            except KeyError:
                likelihood_rejected += -300
    # Continuous features: evaluate the class-conditional Gaussian density.
    for feature in continuous_features:
        (mean, stdev) = approved_continuous_features[feature]
        likelihood_approved += np.log2(normal_distribution(sample_data[feature], mean, stdev))
        (mean, stdev) = rejected_continuous_features[feature]
        likelihood_rejected += np.log2(normal_distribution(sample_data[feature], mean, stdev))
    # Add the log prior of each class, then predict the class with the higher score.
    likelihood_approved += np.log2(len(approved_df) / len(train_data))
    likelihood_rejected += np.log2(len(rejected_df) / len(train_data))
    if likelihood_approved > likelihood_rejected:
        test_data.loc[idx, 'predicted_loan_status'] = 1
    else:
        test_data.loc[idx, 'predicted_loan_status'] = 0
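The except KeyError branches above fall back to a log-probability of -300 whenever a test sample carries a category value that never appeared in one class's training data. A common alternative, not used in this notebook, is Laplace (add-one) smoothing; below is a minimal sketch, where smoothed_probs is a hypothetical helper introduced only for illustration:

def smoothed_probs(class_df, feature, alpha=1.0):
    # Add-one smoothing: every category seen anywhere in the training data gets
    # a non-zero probability, so no fallback constant is needed at test time.
    counts = class_df[feature].value_counts()
    categories = train_data[feature].unique()
    total = counts.sum() + alpha * len(categories)
    return {category: (counts.get(category, 0) + alpha) / total for category in categories}

approved_discrete_smoothed = {feature: smoothed_probs(approved_df, feature)
                              for feature in discrete_features if feature != 'loan_status'}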
### fraction of all predictions that are correct (accuracy)
accuracy = (test_data.loan_status == test_data.predicted_loan_status).sum() \
           / len(test_data)
### percentage of positives that are correctly predicted
sensitivity = (test_data[(test_data.loan_status == 1) & (test_data.predicted_loan_status == 1)].shape[0]
               / test_data[test_data.loan_status == 1].shape[0])
### percentage of negatives that are correctly predicted
specificity = (test_data[(test_data.loan_status == 0) & (test_data.predicted_loan_status == 0)].shape[0]
               / test_data[test_data.loan_status == 0].shape[0])
print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
Accuracy: 0.8829
Sensitivity: 0.7763
Specificity: 0.9145
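For a fuller picture than three scalar metrics, the full confusion matrix can be tabulated directly from the two label columns (a quick follow-up sketch, not part of the original notebook):

# Rows are the actual labels, columns the predicted labels.
confusion = pd.crosstab(test_data['loan_status'],
                        test_data['predicted_loan_status'],
                        rownames=['actual'], colnames=['predicted'])
print(confusion)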