Data Cleaning, EDA, and Statistical Modeling Workbook
Introduction
This notebook walks through a full data analysis pipeline, from raw data ingestion and encoding detection to exploratory data analysis (EDA), statistical modeling, and model performance evaluation. Key tools include chardet and codecs for encoding detection, pandas and skimpy for data exploration, statsmodels for linear and logistic regression, and scikit-learn for evaluation metrics such as confusion matrices and accuracy scores.
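The encoding-detection step mentioned above can be sketched as follows. This is a minimal illustration rather than part of the pipeline below, since the notebook reads its data straight from a Google Sheets export; the file name loans.csv is a placeholder.

# Detect a local file's encoding from a sample of raw bytes (placeholder path)
import chardet

with open("loans.csv", "rb") as f:
    raw = f.read(100_000)  # a sample of the file is usually enough for detection
result = chardet.detect(raw)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99}
print(result["encoding"], result["confidence"])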
In [ ]:
# Install dependencies, if needed
!pip install skimpy
In [ ]:
import chardet
import pandas as pd
# Read the CSV exported from Google Sheets; the default comma delimiter and
# UTF-8 encoding work here, so no explicit encoding detection is needed
url = "https://docs.google.com/spreadsheets/d/10L8BpkV4q1Zsou4daYoWul_8PFA9rsv2/export?format=csv&id=10L8BpkV4q1Zsou4daYoWul_8PFA9rsv2&gid=1710894028"
df = pd.read_csv(url, index_col=False)
df.head()
In [ ]:
# Import the packages used throughout the notebook
import numpy as np
import codecs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn as skl
from statsmodels.formula.api import ols
from statsmodels.formula.api import logit
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from skimpy import skim
print(skl.__version__)
df
In [ ]:
# Show the first 6 rows of the dataframe
df.head(6)
In [ ]:
# Show the last 6 rows of the dataframe
df.tail(6)
In [ ]:
# Keep only the target (default) and the feature (fico_score)
df = df[["default", "fico_score"]]
In [ ]:
df
In [ ]:
# Summary statistics for the full dataframe
skim(df)
In [ ]:
# Plot the target variable against the feature
plt.scatter(df["default"], df["fico_score"], alpha=1.0, s=200)
plt.title("fico_score vs. default")
plt.xlabel("default")
plt.ylabel("fico_score")
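Because default takes only the values 0 and 1, the scatter points overlap heavily. An optional, clearer view of the same relationship is a box plot per class; this sketch assumes the seaborn import from above.

# Compare the fico_score distribution across the two default classes
sns.boxplot(x="default", y="fico_score", data=df)
plt.title("fico_score by default status")
plt.show()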
In [ ]:
# Split the data into 70% train and 30% test
df_train, df_test = train_test_split(df, test_size=0.3)
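Note that this split is random and unseeded, so each run produces different train and test sets. A hedged variant, if reproducibility and class balance matter (random_state=42 is an arbitrary seed):

# Stratify on the target so the default rate is similar in both splits
df_train, df_test = train_test_split(
    df, test_size=0.3, stratify=df["default"], random_state=42
)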
In [ ]:
print(df_train)
# Save the DataFrame to a CSV file
# Replace 'file_path' with the path where you want to save the file
file_path = "df_train.csv"
df_train.to_csv(file_path, index=False)
In [ ]:
skim(df_train)
In [ ]:
skim(df_test)
In [ ]:
# Build the formula: target (dependent variable) ~ features (independent variables)
# Build the model by fitting the formula to the training data with logistic regression (logit)
est = logit(formula="default ~ fico_score", data=df_train).fit()
# Print the model results (est); examine the pseudo R-squared
print(est.summary())
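The logit coefficients are on the log-odds scale, so they are easier to read after exponentiating. This optional step uses only attributes the fitted statsmodels result already exposes:

# Exponentiate the coefficients to get odds ratios: the multiplicative
# change in the odds of default per one-point increase in fico_score
print(np.exp(est.params))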
In [ ]:
# Apply the model (est) to the test data to predict default probabilities
preds = est.predict(df_test)
df_test["predicted_probability"] = preds
# Show the first 6 predicted probabilities
df_test.head(6)
In [ ]:
Copied!
# If predicted_probability > 0.25, assign will_default = 1, otherwise 0
df_test["will_default"] = np.where(df_test["predicted_probability"] > 0.25, 1, 0)
df_test.head(6)
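The 0.25 cutoff is a judgment call rather than a rule; the conventional default is 0.5. An optional sweep shows how accuracy moves with the threshold (for lending decisions, recall on actual defaults may matter more than raw accuracy):

# Compare accuracy at several probability cutoffs
for t in [0.1, 0.25, 0.5, 0.75]:
    pred = np.where(df_test["predicted_probability"] > t, 1, 0)
    print(f"threshold={t}: accuracy={accuracy_score(df_test['default'], pred):.3f}")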
Confusion Matrix
In [ ]:
# Evaluation metrics
# Plot the confusion matrix with class labels, using the built-in "Blues" colormap
disp = ConfusionMatrixDisplay.from_predictions(
df_test["default"],
df_test["will_default"],
display_labels=["No Default", "Default"],
cmap="Blues",
)
plt.title("Confusion Matrix")
plt.show()
# print accuracy
print("Accuracy: " + str(accuracy_score(df_test["default"], df_test["will_default"])))
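Accuracy alone can be misleading when defaults are rare, since always predicting "no default" scores well. An optional check with scikit-learn's classification report adds per-class precision, recall, and F1:

# Per-class precision, recall, and F1 for the same predictions
from sklearn.metrics import classification_report
print(classification_report(df_test["default"], df_test["will_default"],
                            target_names=["No Default", "Default"]))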
In [ ]:
matrix = confusion_matrix(df_test["default"], df_test["will_default"])
# Normalize the matrix to get percentages
normalized_matrix = matrix / np.sum(matrix)
# Create label overlay
labels = ["True Neg", "False Pos", "False Neg", "True Pos"]
labels = np.asarray(labels).reshape(2, 2)
# Format labels with percentages + class names
annot = np.empty_like(labels, dtype=object)
for i in range(2):
for j in range(2):
annot[i, j] = f"{labels[i, j]}\n{normalized_matrix[i, j]:.2%}"
# Plot heatmap with combined labels and percentages
sns.heatmap(normalized_matrix, annot=annot, fmt="", cmap="Blues", cbar=False)
plt.title("Confusion Matrix with Custom Labels")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.xticks([0.5, 1.5], ["No Default", "Default"])
plt.yticks([0.5, 1.5], ["No Default", "Default"], rotation=0)
plt.show()
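Finally, an optional threshold-free summary: ROC AUC scores the predicted probabilities directly, so it does not depend on the 0.25 cutoff chosen above.

# Rank-based evaluation of the predicted probabilities
from sklearn.metrics import roc_auc_score
print("ROC AUC:", roc_auc_score(df_test["default"], df_test["predicted_probability"]))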