Data Cleaning, EDA, and Statistical Modeling Workbook¶
Introduction¶
This notebook walks through a full data analysis pipeline, from raw data ingestion and encoding detection to exploratory data analysis (EDA), statistical modeling, and model performance evaluation. Key tools include chardet and codecs for encoding detection, pandas and skimpy for data exploration, statsmodels for linear and logistic regression, and scikit-learn for evaluation metrics such as confusion matrices and accuracy scores.
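The dataset below is read straight from a URL, so no detection step is needed here, but for local files of unknown origin chardet can sniff the encoding before pandas reads the file. A minimal sketch, assuming a hypothetical local file named loans_raw.csv:

import chardet
import pandas as pd

# Detect the encoding from a sample of raw bytes (file name is hypothetical)
with open("loans_raw.csv", "rb") as f:
    detected = chardet.detect(f.read(100_000))
print(detected)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}

# Hand the detected encoding to pandas
df_raw = pd.read_csv("loans_raw.csv", encoding=detected["encoding"])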
In [ ]:

# Install dependencies, if needed
!pip install skimpy
In [2]:

import chardet
import pandas as pd

# Read the CSV exported from a Google Sheet; the pandas defaults handle the
# encoding and delimiter here, but chardet is available if detection is needed
url = "https://docs.google.com/spreadsheets/d/10L8BpkV4q1Zsou4daYoWul_8PFA9rsv2/export?format=csv&id=10L8BpkV4q1Zsou4daYoWul_8PFA9rsv2&gid=1710894028"
df = pd.read_csv(url, index_col=False)
df.head()
Out[2]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
0 | 0 | 829 | 4.93 | 737 | 28.85 | 0 | 0 |
1 | 0 | 228 | 4.81 | 707 | 33.62 | 0 | 0 |
2 | 0 | 367 | 4.51 | 682 | 3.51 | 1 | 0 |
3 | 0 | 162 | 4.93 | 712 | 33.67 | 1 | 0 |
4 | 0 | 103 | 4.91 | 667 | 4.74 | 0 | 0 |
In [3]:

# Import the packages used throughout the notebook
import numpy as np
import codecs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn as skl
from statsmodels.formula.api import ols
from statsmodels.formula.api import logit
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from skimpy import skim

print(skl.__version__)
df
1.6.1
Out[3]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
0 | 0 | 829 | 4.93 | 737 | 28.85 | 0 | 0 |
1 | 0 | 228 | 4.81 | 707 | 33.62 | 0 | 0 |
2 | 0 | 367 | 4.51 | 682 | 3.51 | 1 | 0 |
3 | 0 | 162 | 4.93 | 712 | 33.67 | 1 | 0 |
4 | 0 | 103 | 4.91 | 667 | 4.74 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... |
9511 | 1 | 345 | 5.29 | 672 | 215.37 | 2 | 0 |
9512 | 1 | 258 | 4.84 | 722 | 0.18 | 5 | 0 |
9513 | 1 | 98 | 4.60 | 687 | 10.04 | 8 | 0 |
9514 | 1 | 352 | 4.70 | 692 | 0.00 | 5 | 0 |
9515 | 1 | 853 | 4.89 | 732 | 37.88 | 6 | 0 |
9516 rows × 7 columns
In [4]:

# Show the first 6 rows of the dataframe
df.head(6)
Out[4]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
0 | 0 | 829 | 4.93 | 737 | 28.85 | 0 | 0 |
1 | 0 | 228 | 4.81 | 707 | 33.62 | 0 | 0 |
2 | 0 | 367 | 4.51 | 682 | 3.51 | 1 | 0 |
3 | 0 | 162 | 4.93 | 712 | 33.67 | 1 | 0 |
4 | 0 | 103 | 4.91 | 667 | 4.74 | 0 | 0 |
5 | 0 | 125 | 5.17 | 727 | 50.81 | 0 | 0 |
In [5]:

# Show the last 6 rows of the dataframe
df.tail(6)
Out[5]:
default | installment | log_income | fico_score | rev_balance | inquiries | records | |
---|---|---|---|---|---|---|---|
9510 | 1 | 70 | 4.39 | 662 | 3.00 | 6 | 0 |
9511 | 1 | 345 | 5.29 | 672 | 215.37 | 2 | 0 |
9512 | 1 | 258 | 4.84 | 722 | 0.18 | 5 | 0 |
9513 | 1 | 98 | 4.60 | 687 | 10.04 | 8 | 0 |
9514 | 1 | 352 | 4.70 | 692 | 0.00 | 5 | 0 |
9515 | 1 | 853 | 4.89 | 732 | 37.88 | 6 | 0 |
In [6]:

# Keep only the target (default) and the single feature (fico_score)
df = df[["default", "fico_score"]]
In [7]:

df
Out[7]:
default | fico_score | |
---|---|---|
0 | 0 | 737 |
1 | 0 | 707 |
2 | 0 | 682 |
3 | 0 | 712 |
4 | 0 | 667 |
... | ... | ... |
9511 | 1 | 672 |
9512 | 1 | 722 |
9513 | 1 | 687 |
9514 | 1 | 692 |
9515 | 1 | 732 |
9516 rows × 2 columns
In [8]:

# Summary statistics for the reduced dataframe
skim(df)
skimpy summary: 9516 rows, 2 columns (both int64)

column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100
---|---|---|---|---|---|---|---|---|---
default | 0 | 0 | 0.1598 | 0.3665 | 0 | 0 | 0 | 0 | 1
fico_score | 0 | 0 | 710.8 | 37.96 | 612 | 682 | 707 | 737 | 827
In [9]:

# Plot the target variable against the feature
plt.scatter(df["default"], df["fico_score"], alpha=1.0, s=200)
plt.title("fico vs. default")
plt.xlabel("default")
plt.ylabel("fico")
Out[9]:

Text(0, 0.5, 'fico')

[scatter plot: fico_score vs. default]
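Because default is binary, the scatter points pile up on two vertical lines. A box plot per class can make the two FICO distributions easier to compare; a minimal sketch using seaborn, which is already imported above:

# Distribution of fico_score within each default class
sns.boxplot(data=df, x="default", y="fico_score")
plt.title("fico_score by default status")
plt.show()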
In [10]:

# Split the data into 70% train and 30% test
# (no random_state is set, so the split changes on each run)
df_train, df_test = train_test_split(df, test_size=0.3)
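For repeatable results, train_test_split can take a random_state, and stratify keeps the ~16% default rate equal in both halves. A sketch of that variant (the seed value 42 is an arbitrary choice):

from sklearn.model_selection import train_test_split

# Reproducible, class-balanced 70/30 split
df_train, df_test = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["default"]
)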
In [11]:

print(df_train)

# Save the DataFrame to a CSV file
# Replace 'file_path' with the path where you want to save the file
file_path = "df_train.csv"
df_train.to_csv(file_path, index=False)
      default  fico_score
578         0         717
6479        0         727
2787        0         677
1931        0         697
7539        0         687
...       ...         ...
7505        0         672
5436        0         667
7359        0         742
9056        1         702
8920        0         672

[6661 rows x 2 columns]
In [12]:

# Summary statistics for the training split
skim(df_train)
skimpy summary: 6661 rows, 2 columns (both int64)

column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100
---|---|---|---|---|---|---|---|---|---
default | 0 | 0 | 0.1599 | 0.3665 | 0 | 0 | 0 | 0 | 1
fico_score | 0 | 0 | 710.8 | 37.98 | 612 | 682 | 707 | 737 | 827
In [13]:

# Summary statistics for the test split
skim(df_test)
skimpy summary: 2855 rows, 2 columns (both int64)

column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100
---|---|---|---|---|---|---|---|---|---
default | 0 | 0 | 0.1597 | 0.3664 | 0 | 0 | 0 | 0 | 1
fico_score | 0 | 0 | 711 | 37.91 | 627 | 682 | 707 | 737 | 822
In [14]:

# Build the formula: target (dependent variable) ~ feature (independent variable),
# then fit a logistic regression (logit) to the training data
est = logit(formula="default ~ fico_score", data=df_train).fit()

# Print the fitted model's results; examine the pseudo R-squared
print(est.summary())
Optimization terminated successfully.
         Current function value: 0.428098
         Iterations 6
                           Logit Regression Results
==============================================================================
Dep. Variable:                default   No. Observations:                 6661
Model:                          Logit   Df Residuals:                     6659
Method:                           MLE   Df Model:                            1
Date:                Fri, 30 May 2025   Pseudo R-squ.:                 0.02590
Time:                        12:45:42   Log-Likelihood:                -2851.6
converged:                       True   LL-Null:                       -2927.4
Covariance Type:            nonrobust   LLR p-value:                 7.607e-35
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      6.5097      0.688      9.455      0.000       5.160       7.859
fico_score    -0.0116      0.001    -11.792      0.000      -0.014      -0.010
==============================================================================
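The fico_score coefficient is on the log-odds scale, so exponentiating it gives an odds ratio. A short sketch of that conversion, reusing the fitted est from above:

# Convert log-odds coefficients to odds ratios
odds_ratios = np.exp(est.params)
print(odds_ratios)
# exp(-0.0116) ≈ 0.988: each additional FICO point multiplies the
# odds of default by roughly 0.988, i.e. about a 1.2% reduction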
In [15]:

# Apply the model (est) to the test data to predict default probabilities
preds = est.predict(df_test)
df_test["predicted_probability"] = preds

# Show the first 6 rows with their predicted probabilities
df_test.head(6)
Out[15]:
default | fico_score | predicted_probability | |
---|---|---|---|
1393 | 1 | 692 | 0.181684 |
3933 | 0 | 672 | 0.218686 |
5889 | 0 | 707 | 0.157267 |
9452 | 0 | 702 | 0.165095 |
5003 | 0 | 697 | 0.173233 |
8064 | 1 | 702 | 0.165095 |
In [16]:

# If predicted_probability > 0.25, assign will_default to 1, otherwise 0
df_test["will_default"] = np.where(df_test["predicted_probability"] > 0.25, 1, 0)
print(df_test)
      default  fico_score  predicted_probability  will_default
1393        1         692               0.181684             0
3933        0         672               0.218686             0
5889        0         707               0.157267             0
9452        0         702               0.165095             0
5003        0         697               0.173233             0
...       ...         ...                    ...           ...
8651        0         677               0.208953             0
2222        0         672               0.218686             0
7909        1         657               0.249813             0
1047        0         742               0.110654             0
6932        0         732               0.122576             0

[2855 rows x 4 columns]
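The 0.25 cutoff is a modeling choice: with only ~16% of loans defaulting, a conventional 0.5 threshold would flag almost nobody. One way to weigh candidate cutoffs is to compare overall accuracy against recall on the defaulters; a minimal sketch reusing df_test from above:

from sklearn.metrics import recall_score

# Compare candidate thresholds on accuracy and default recall
for threshold in [0.15, 0.20, 0.25, 0.30, 0.50]:
    pred = (df_test["predicted_probability"] > threshold).astype(int)
    acc = accuracy_score(df_test["default"], pred)
    rec = recall_score(df_test["default"], pred, zero_division=0)
    print(f"threshold={threshold:.2f}  accuracy={acc:.3f}  recall={rec:.3f}")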
Confusion Matrix¶
In [17]:

# Evaluation metrics: plot the labeled confusion matrix with a blue colormap
disp = ConfusionMatrixDisplay.from_predictions(
    df_test["default"],
    df_test["will_default"],
    display_labels=["No Default", "Default"],
    cmap="Blues",
)
plt.title("Confusion Matrix")
plt.show()

# Print overall accuracy
print("Accuracy: " + str(accuracy_score(df_test["default"], df_test["will_default"])))
Accuracy: 0.8217162872154116
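An accuracy of ~0.82 should be read against the class balance: roughly 84% of test loans never default, so a trivial model that always predicts "no default" scores about the same. A quick baseline check, reusing df_test:

# Baseline: predict "no default" for every test loan
baseline = np.zeros(len(df_test), dtype=int)
print("Baseline accuracy:", accuracy_score(df_test["default"], baseline))
# ~0.84 here, so the model's 0.82 accuracy mostly reflects class
# imbalance; recall on defaulters is the more telling metric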
In [18]:

matrix = confusion_matrix(df_test["default"], df_test["will_default"])

# Normalize the matrix so each cell is a share of all test loans
normalized_matrix = matrix / np.sum(matrix)

# Create the cell-label overlay
labels = ["True Neg", "False Pos", "False Neg", "True Pos"]
labels = np.asarray(labels).reshape(2, 2)

# Combine class names with percentages in each annotation
annot = np.empty_like(labels, dtype=object)
for i in range(2):
    for j in range(2):
        annot[i, j] = f"{labels[i, j]}\n{normalized_matrix[i, j]:.2%}"

# Plot the heatmap with combined labels and percentages
sns.heatmap(normalized_matrix, annot=annot, fmt="", cmap="Blues", cbar=False)
plt.title("Confusion Matrix with Custom Labels")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.xticks([0.5, 1.5], ["No Default", "Default"])
plt.yticks([0.5, 1.5], ["No Default", "Default"], rotation=0)
plt.show()
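The four cells of the matrix map directly onto the standard derived metrics. A short sketch computing precision and recall for the "Default" class from the same matrix object:

# Unpack raw counts: ravel() on a 2x2 confusion matrix yields tn, fp, fn, tp
tn, fp, fn, tp = matrix.ravel()
precision = tp / (tp + fp) if (tp + fp) else 0.0  # guard against no positive predictions
recall = tp / (tp + fn)
print(f"Default precision: {precision:.3f}, recall: {recall:.3f}")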