Data Cleaning, EDA, and Statistical Modeling Workbook¶
Introduction¶
This notebook walks through a full data analysis pipeline, from raw data ingestion and encoding detection to exploratory data analysis (EDA), statistical modeling, and model performance evaluation. Key tools include chardet and codecs for encoding detection, pandas and skimpy for data exploration, statsmodels for linear and logistic regression, and scikit-learn for evaluation metrics such as confusion matrices and accuracy scores.
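The dataset below is read straight from a URL, so no detection step is needed here, but for local files of unknown origin chardet can sniff the encoding before pandas reads the file. A minimal sketch, assuming a hypothetical local file named loans_raw.csv:

import chardet
import pandas as pd

# Detect the encoding from a sample of raw bytes (file name is hypothetical)
with open("loans_raw.csv", "rb") as f:
    detected = chardet.detect(f.read(100_000))
print(detected)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}

# Hand the detected encoding to pandas
df_raw = pd.read_csv("loans_raw.csv", encoding=detected["encoding"])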
In [ ]:

# Install dependencies, if needed
!pip install skimpy
In [2]:

import chardet
import pandas as pd

# Read the CSV exported from a Google Sheet; the pandas defaults handle the
# encoding and delimiter here, but chardet is available if detection is needed
url = "https://docs.google.com/spreadsheets/d/10L8BpkV4q1Zsou4daYoWul_8PFA9rsv2/export?format=csv&id=10L8BpkV4q1Zsou4daYoWul_8PFA9rsv2&gid=1710894028"
df = pd.read_csv(url, index_col=False)
df.head()
Out[2]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
0 | 0 | 829 | 4.93 | 737 | 28.85 | 0 | 0 |
1 | 0 | 228 | 4.81 | 707 | 33.62 | 0 | 0 |
2 | 0 | 367 | 4.51 | 682 | 3.51 | 1 | 0 |
3 | 0 | 162 | 4.93 | 712 | 33.67 | 1 | 0 |
4 | 0 | 103 | 4.91 | 667 | 4.74 | 0 | 0 |
In [3]:

# Import the packages used throughout the notebook
import numpy as np
import codecs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn as skl
from statsmodels.formula.api import ols
from statsmodels.formula.api import logit
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from skimpy import skim

print(skl.__version__)
df
1.6.1
Out[3]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
0 | 0 | 829 | 4.93 | 737 | 28.85 | 0 | 0 |
1 | 0 | 228 | 4.81 | 707 | 33.62 | 0 | 0 |
2 | 0 | 367 | 4.51 | 682 | 3.51 | 1 | 0 |
3 | 0 | 162 | 4.93 | 712 | 33.67 | 1 | 0 |
4 | 0 | 103 | 4.91 | 667 | 4.74 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... |
9511 | 1 | 345 | 5.29 | 672 | 215.37 | 2 | 0 |
9512 | 1 | 258 | 4.84 | 722 | 0.18 | 5 | 0 |
9513 | 1 | 98 | 4.60 | 687 | 10.04 | 8 | 0 |
9514 | 1 | 352 | 4.70 | 692 | 0.00 | 5 | 0 |
9515 | 1 | 853 | 4.89 | 732 | 37.88 | 6 | 0 |
9516 rows × 7 columns
In [4]:

# Show the first 6 rows of the dataframe
df.head(6)
Out[4]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
0 | 0 | 829 | 4.93 | 737 | 28.85 | 0 | 0 |
1 | 0 | 228 | 4.81 | 707 | 33.62 | 0 | 0 |
2 | 0 | 367 | 4.51 | 682 | 3.51 | 1 | 0 |
3 | 0 | 162 | 4.93 | 712 | 33.67 | 1 | 0 |
4 | 0 | 103 | 4.91 | 667 | 4.74 | 0 | 0 |
5 | 0 | 125 | 5.17 | 727 | 50.81 | 0 | 0 |
In [5]:

# Show the last 6 rows of the dataframe
df.tail(6)
Out[5]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
9510 | 1 | 70 | 4.39 | 662 | 3.00 | 6 | 0 |
9511 | 1 | 345 | 5.29 | 672 | 215.37 | 2 | 0 |
9512 | 1 | 258 | 4.84 | 722 | 0.18 | 5 | 0 |
9513 | 1 | 98 | 4.60 | 687 | 10.04 | 8 | 0 |
9514 | 1 | 352 | 4.70 | 692 | 0.00 | 5 | 0 |
9515 | 1 | 853 | 4.89 | 732 | 37.88 | 6 | 0 |
In [6]:

# Keep only the target (default) and the single feature (fico_score)
df = df[["default", "fico_score"]]
In [7]:

df
Out[7]:
default | fico_score | |
---|---|---|
0 | 0 | 737 |
1 | 0 | 707 |
2 | 0 | 682 |
3 | 0 | 712 |
4 | 0 | 667 |
... | ... | ... |
9511 | 1 | 672 |
9512 | 1 | 722 |
9513 | 1 | 687 |
9514 | 1 | 692 |
9515 | 1 | 732 |
9516 rows × 2 columns
In [8]:

# Summary statistics for the reduced dataframe
skim(df)
skimpy summary: 9516 rows, 2 columns (both int64)

column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100
---|---|---|---|---|---|---|---|---|---
default | 0 | 0 | 0.1598 | 0.3665 | 0 | 0 | 0 | 0 | 1
fico_score | 0 | 0 | 710.8 | 37.96 | 612 | 682 | 707 | 737 | 827
In [9]:

# Plot the target variable against the feature
plt.scatter(df["default"], df["fico_score"], alpha=1.0, s=200)
plt.title("fico vs. default")
plt.xlabel("default")
plt.ylabel("fico")
Out[9]:

Text(0, 0.5, 'fico')

[scatter plot: fico_score vs. default]
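Because default is binary, the scatter points pile up on two vertical lines. A box plot per class can make the two FICO distributions easier to compare; a minimal sketch using seaborn, which is already imported above:

# Distribution of fico_score within each default class
sns.boxplot(data=df, x="default", y="fico_score")
plt.title("fico_score by default status")
plt.show()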
In [10]:

# Split the data into 70% train and 30% test
# (no random_state is set, so the split changes on each run)
df_train, df_test = train_test_split(df, test_size=0.3)
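For repeatable results, train_test_split can take a random_state, and stratify keeps the ~16% default rate equal in both halves. A sketch of that variant (the seed value 42 is an arbitrary choice):

from sklearn.model_selection import train_test_split

# Reproducible, class-balanced 70/30 split
df_train, df_test = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["default"]
)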
In [11]:

print(df_train)

# Save the DataFrame to a CSV file
# Replace 'file_path' with the path where you want to save the file
file_path = "df_train.csv"
df_train.to_csv(file_path, index=False)
      default  fico_score
578         0         717
6479        0         727
2787        0         677
1931        0         697
7539        0         687
...       ...         ...
7505        0         672
5436        0         667
7359        0         742
9056        1         702
8920        0         672

[6661 rows x 2 columns]
In [12]:

# Summary statistics for the training split
skim(df_train)
skimpy summary: 6661 rows, 2 columns (both int64)

column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100
---|---|---|---|---|---|---|---|---|---
default | 0 | 0 | 0.1599 | 0.3665 | 0 | 0 | 0 | 0 | 1
fico_score | 0 | 0 | 710.8 | 37.98 | 612 | 682 | 707 | 737 | 827
In [13]:

# Summary statistics for the test split
skim(df_test)
skimpy summary: 2855 rows, 2 columns (both int64)

column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100
---|---|---|---|---|---|---|---|---|---
default | 0 | 0 | 0.1597 | 0.3664 | 0 | 0 | 0 | 0 | 1
fico_score | 0 | 0 | 711 | 37.91 | 627 | 682 | 707 | 737 | 822
In [14]:

# Build the formula: target (dependent variable) ~ feature (independent variable),
# then fit a logistic regression (logit) to the training data
est = logit(formula="default ~ fico_score", data=df_train).fit()

# Print the fitted model's results; examine the pseudo R-squared
print(est.summary())
Optimization terminated successfully.
         Current function value: 0.428098
         Iterations 6
                           Logit Regression Results
==============================================================================
Dep. Variable:                default   No. Observations:                 6661
Model:                          Logit   Df Residuals:                     6659
Method:                           MLE   Df Model:                            1
Date:                Fri, 30 May 2025   Pseudo R-squ.:                 0.02590
Time:                        12:45:42   Log-Likelihood:                -2851.6
converged:                       True   LL-Null:                       -2927.4
Covariance Type:            nonrobust   LLR p-value:                 7.607e-35
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      6.5097      0.688      9.455      0.000       5.160       7.859
fico_score    -0.0116      0.001    -11.792      0.000      -0.014      -0.010
==============================================================================
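The fico_score coefficient is on the log-odds scale, so exponentiating it gives an odds ratio. A short sketch of that conversion, reusing the fitted est from above:

# Convert log-odds coefficients to odds ratios
odds_ratios = np.exp(est.params)
print(odds_ratios)
# exp(-0.0116) ≈ 0.988: each additional FICO point multiplies the
# odds of default by roughly 0.988, i.e. about a 1.2% reduction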
In [15]:

# Apply the model (est) to the test data to predict default probabilities
preds = est.predict(df_test)
df_test["predicted_probability"] = preds

# Show the first 6 rows with their predicted probabilities
df_test.head(6)
Out[15]:
default | fico_score | predicted_probability | |
---|---|---|---|
1393 | 1 | 692 | 0.181684 |
3933 | 0 | 672 | 0.218686 |
5889 | 0 | 707 | 0.157267 |
9452 | 0 | 702 | 0.165095 |
5003 | 0 | 697 | 0.173233 |
8064 | 1 | 702 | 0.165095 |
In [16]:

# If predicted_probability > 0.25, assign will_default to 1, otherwise 0
df_test["will_default"] = np.where(df_test["predicted_probability"] > 0.25, 1, 0)
print(df_test)
      default  fico_score  predicted_probability  will_default
1393        1         692               0.181684             0
3933        0         672               0.218686             0
5889        0         707               0.157267             0
9452        0         702               0.165095             0
5003        0         697               0.173233             0
...       ...         ...                    ...           ...
8651        0         677               0.208953             0
2222        0         672               0.218686             0
7909        1         657               0.249813             0
1047        0         742               0.110654             0
6932        0         732               0.122576             0

[2855 rows x 4 columns]
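The 0.25 cutoff is a modeling choice: with only ~16% of loans defaulting, a conventional 0.5 threshold would flag almost nobody. One way to weigh candidate cutoffs is to compare overall accuracy against recall on the defaulters; a minimal sketch reusing df_test from above:

from sklearn.metrics import recall_score

# Compare candidate thresholds on accuracy and default recall
for threshold in [0.15, 0.20, 0.25, 0.30, 0.50]:
    pred = (df_test["predicted_probability"] > threshold).astype(int)
    acc = accuracy_score(df_test["default"], pred)
    rec = recall_score(df_test["default"], pred, zero_division=0)
    print(f"threshold={threshold:.2f}  accuracy={acc:.3f}  recall={rec:.3f}")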
Confusion Matrix¶
In [17]:

# Evaluation metrics: plot the labeled confusion matrix with a blue colormap
disp = ConfusionMatrixDisplay.from_predictions(
    df_test["default"],
    df_test["will_default"],
    display_labels=["No Default", "Default"],
    cmap="Blues",
)
plt.title("Confusion Matrix")
plt.show()

# Print overall accuracy
print("Accuracy: " + str(accuracy_score(df_test["default"], df_test["will_default"])))
Accuracy: 0.8217162872154116
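An accuracy of ~0.82 should be read against the class balance: roughly 84% of test loans never default, so a trivial model that always predicts "no default" scores about the same. A quick baseline check, reusing df_test:

# Baseline: predict "no default" for every test loan
baseline = np.zeros(len(df_test), dtype=int)
print("Baseline accuracy:", accuracy_score(df_test["default"], baseline))
# ~0.84 here, so the model's 0.82 accuracy mostly reflects class
# imbalance; recall on defaulters is the more telling metric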
In [18]:

matrix = confusion_matrix(df_test["default"], df_test["will_default"])

# Normalize the matrix so each cell is a share of all test loans
normalized_matrix = matrix / np.sum(matrix)

# Create the cell-label overlay
labels = ["True Neg", "False Pos", "False Neg", "True Pos"]
labels = np.asarray(labels).reshape(2, 2)

# Combine class names with percentages in each annotation
annot = np.empty_like(labels, dtype=object)
for i in range(2):
    for j in range(2):
        annot[i, j] = f"{labels[i, j]}\n{normalized_matrix[i, j]:.2%}"

# Plot the heatmap with combined labels and percentages
sns.heatmap(normalized_matrix, annot=annot, fmt="", cmap="Blues", cbar=False)
plt.title("Confusion Matrix with Custom Labels")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.xticks([0.5, 1.5], ["No Default", "Default"])
plt.yticks([0.5, 1.5], ["No Default", "Default"], rotation=0)
plt.show()
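The four cells of the matrix map directly onto the standard derived metrics. A short sketch computing precision and recall for the "Default" class from the same matrix object:

# Unpack raw counts: ravel() on a 2x2 confusion matrix yields tn, fp, fn, tp
tn, fp, fn, tp = matrix.ravel()
precision = tp / (tp + fp) if (tp + fp) else 0.0  # guard against no positive predictions
recall = tp / (tp + fn)
print(f"Default precision: {precision:.3f}, recall: {recall:.3f}")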