Regression
Blue Bikes Rental Data
Let's start by importing what we need...
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import norm
from skimpy import skim
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import datasets
Let's read the data into a pandas DataFrame. We'll see that this is a nifty data structure; we'll take a look at it shortly.
url = "https://docs.google.com/spreadsheets/d/1rgHhBRqSZG5sfAMbbEVP2-IA0tAw7PftO6JLmJL4dY0/export?format=csv&id=1rgHhBRqSZG5sfAMbbEVP2-IA0tAw7PftO6JLmJL4dY0&gid=713155364"
df = pd.read_csv(url)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8603 entries, 0 to 8602
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   rentals        8603 non-null   int64
 1   month          8603 non-null   int64
 2   day            8603 non-null   int64
 3   hour           8603 non-null   int64
 4   day_of_week    8603 non-null   object
 5   weekend        8603 non-null   int64
 6   temp           8603 non-null   int64
 7   temp_wb        8603 non-null   int64
 8   rel_humidity   8603 non-null   int64
 9   windspeed      8603 non-null   int64
 10  precipitation  8603 non-null   float64
dtypes: float64(1), int64(9), object(1)
memory usage: 739.4+ KB
df.head()
 | rentals | month | day | hour | day_of_week | weekend | temp | temp_wb | rel_humidity | windspeed | precipitation |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | 1 | 1 | 0 | Mon | 0 | 2 | 0 | 59 | 16 | 0.0 |
1 | 6 | 1 | 1 | 1 | Mon | 0 | 1 | 0 | 59 | 11 | 0.0 |
2 | 6 | 1 | 1 | 2 | Mon | 0 | 1 | -1 | 54 | 21 | 0.0 |
3 | 1 | 1 | 1 | 5 | Mon | 0 | 0 | -2 | 54 | 18 | 0.0 |
4 | 3 | 1 | 1 | 6 | Mon | 0 | 0 | -2 | 54 | 15 | 0.0 |
df.tail(6)
 | rentals | month | day | hour | day_of_week | weekend | temp | temp_wb | rel_humidity | windspeed | precipitation |
---|---|---|---|---|---|---|---|---|---|---|---|
8597 | 69 | 12 | 31 | 18 | Mon | 0 | 42 | 37 | 60 | 10 | 0.00 |
8598 | 36 | 12 | 31 | 19 | Mon | 0 | 39 | 36 | 76 | 13 | 0.02 |
8599 | 25 | 12 | 31 | 20 | Mon | 0 | 38 | 36 | 86 | 14 | 0.04 |
8600 | 13 | 12 | 31 | 21 | Mon | 0 | 37 | 36 | 89 | 15 | 0.07 |
8601 | 6 | 12 | 31 | 22 | Mon | 0 | 38 | 37 | 89 | 14 | 0.09 |
8602 | 7 | 12 | 31 | 23 | Mon | 0 | 39 | 38 | 89 | 17 | 0.19 |
Looking at the data, it's clear that columns like 'day_of_week' or 'day' are not truly numerical (in the sense of a temperature or a price) but rather categorical. So we go ahead and declare those columns of the dataframe as such.
categorical_variables = ["month", "day", "hour", "day_of_week", "weekend"]
df[categorical_variables] = df[categorical_variables].astype("category")
skim(df)
skimpy summary

Data Summary: 8603 rows × 11 columns
Data Types: int64 (5), category (5), float64 (1)
Categorical Variables: month, day, hour, day_of_week, weekend

number
column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100 | hist
---|---|---|---|---|---|---|---|---|---|---
rentals | 0 | 0 | 205.1 | 227.1 | 1 | 32 | 120 | 306 | 1299 | ▇▃▁
temp | 0 | 0 | 53.21 | 18.29 | -2 | 38 | 52 | 69 | 97 | ▂▇▆▇▂
temp_wb | 0 | 0 | 47.84 | 16.91 | -3 | 34 | 48 | 62 | 81 | ▂▇▇▇▅
rel_humidity | 0 | 0 | 67.11 | 18.92 | 16 | 53 | 68 | 84 | 100 | ▁▃▇▇▇▇
windspeed | 0 | 0 | 11.1 | 5.455 | 0 | 7 | 10 | 14 | 48 | ▃▇▂
precipitation | 0 | 0 | 0.00544 | 0.03407 | 0 | 0 | 0 | 0 | 1.12 | ▇

category
column | NA | NA % | ordered | unique
---|---|---|---|---
month | 0 | 0 | False | 12
day | 0 | 0 | False | 31
hour | 0 | 0 | False | 24
day_of_week | 0 | 0 | False | 7
weekend | 0 | 0 | False | 2
Let's produce a histogram of the rentals data. We've chosen bins based on the range we saw in the 'skim' summary above.
plt.hist(df["rentals"], bins=range(0, 1400, 50))
plt.title("Rentals Data")
plt.xlabel("Number of Rentals")
plt.ylabel("Number of Data Points")
The mean number of rentals is ~205, with a standard deviation of ~227.
print(
"Average no. of Rentals: "
+ str(np.mean(df["rentals"]))
+ " Standard Dev: "
+ str(np.std(df["rentals"]))
)
Average no. of Rentals: 205.08578402882716 Standard Dev: 227.0392651748052
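A small aside: np.std computes the population standard deviation (denominator n), while pandas' own Series.std defaults to the sample version (denominator n−1), so the two can differ slightly. A quick check of the same summary via pandas:
# Same summary via pandas; ddof=0 matches NumPy's population convention
print(df["rentals"].mean(), df["rentals"].std(ddof=0))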
Next, a histogram of rentals at a temp of 25F.
df_cold = df[df["temp"] == 25]
plt.hist(df_cold["rentals"], bins=range(0, 1400, 50))
plt.title("Rentals Data at Temp of 25F")
plt.xlabel("Number of Rentals")
plt.ylabel("Number of Data Points")
The standard deviation is now a lot smaller.
print(
"Average no. of Rentals at 25F: "
+ str(np.mean(df_cold["rentals"]))
+ " Standard Dev: "
+ str(np.std(df_cold["rentals"]))
)
Average no. of Rentals at 25F: 72.55384615384615 Standard Dev: 90.4802623218041
A scatter plot of rentals by hour of day shows that knowing the hour could dramatically reduce variability.
plt.scatter(df["hour"], df["rentals"], s=2)
plt.title("Rentals vs. Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("# Rentals")
Now let's take a look at rentals vs. temperature. We also plot the mean number of rentals at each temperature and notice a definite straight-line trend.
plt.scatter(df["temp"], df["rentals"], s=2)
plt.title("Rentals vs. Temp (mean value in orange)")
plt.xlabel("Temp")
plt.ylabel("# Rentals")
plt.scatter(df["temp"].unique(), df.groupby("temp")["rentals"].mean())
The above scatter plot is the key motivation for linear regression, which we turn to next. In particular, we will run a number of regressions of 'rentals' against some other independent variable (e.g., temp) that we believe has explanatory power.
est = ols(formula="rentals ~ temp", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.235
Model:                            OLS   Adj. R-squared:                  0.235
Method:                 Least Squares   F-statistic:                     2641.
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:48:32   Log-Likelihood:                -57727.
No. Observations:                8603   AIC:                         1.155e+05
Df Residuals:                    8601   BIC:                         1.155e+05
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -115.1099      6.588    -17.473      0.000    -128.024    -102.196
temp           6.0176      0.117     51.395      0.000       5.788       6.247
==============================================================================
Omnibus:                    2088.128   Durbin-Watson:                   0.424
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             5175.982
Skew:                          1.332   Prob(JB):                         0.00
Kurtosis:                      5.709   Cond. No.                         173.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
b0 = -115.1099  # intercept from the summary above
b1 = 6.0176  # coefficient on temp
# Compute the fitted regression line over a grid of temperatures
x = range(100)
y = [b0 + b1 * i for i in x]
plt.scatter(df["temp"], df["rentals"], s=2)
plt.title("Rentals vs. Temp")
plt.xlabel("Temp")
plt.ylabel("# Rentals")
plt.plot(x, y, color="red")
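Rather than copying coefficients by hand, we can pull them straight from the fitted results; a small sketch (with the formula API, est.params is a pandas Series indexed by term name):
# Extract the fitted coefficients directly from the results object
b0, b1 = est.params["Intercept"], est.params["temp"]
xs = np.linspace(df["temp"].min(), df["temp"].max(), 100)
plt.plot(xs, b0 + b1 * xs, color="red")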
The standard deviation of the residuals (the RMSE of the fit) tells us how much variability remains:
print(np.sqrt(est.mse_resid))
198.6078273046469
est = ols(formula="rentals ~ rel_humidity", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     249.5
Date:                Sun, 18 May 2025   Prob (F-statistic):           1.99e-55
Time:                        16:49:10   Log-Likelihood:                -58756.
No. Observations:                8603   AIC:                         1.175e+05
Df Residuals:                    8601   BIC:                         1.175e+05
Df Model:                           1
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      340.2762      8.892     38.266      0.000     322.845     357.707
rel_humidity    -2.0146      0.128    -15.796      0.000      -2.265      -1.765
==============================================================================
Omnibus:                    2576.191   Durbin-Watson:                   0.336
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             6709.611
Skew:                          1.633   Prob(JB):                         0.00
Kurtosis:                      5.837   Cond. No.                         257.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
b0 = 340.2762  # intercept from the summary above
b1 = -2.0146  # coefficient on rel_humidity
# Compute the fitted regression line over a grid of humidity values
x = range(100)
y = [b0 + b1 * i for i in x]
plt.scatter(df["rel_humidity"], df["rentals"], s=2)
plt.title("Rentals vs. Rel Humidity")
plt.xlabel("Rel Humidity")
plt.ylabel("# Rentals")
plt.plot(x, y, color="red")
print(np.sqrt(est.mse_resid))
223.84213890754862
plt.scatter(df["weekend"], df["rentals"], s=2)
plt.title("Rentals vs. Weekend [1=yes, 0=no]")
plt.xlabel("Weekend")
plt.ylabel("# Rentals")
est = ols(formula="rentals ~ weekend", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     132.1
Date:                Sun, 18 May 2025   Prob (F-statistic):           2.42e-30
Time:                        16:50:01   Log-Likelihood:                -58814.
No. Observations:                8603   AIC:                         1.176e+05
Df Residuals:                    8601   BIC:                         1.176e+05
Df Model:                           1
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      222.8186      2.878     77.417      0.000     217.177     228.460
weekend[T.1]   -61.6884      5.368    -11.492      0.000     -72.211     -51.166
==============================================================================
Omnibus:                    2401.797   Durbin-Watson:                   0.335
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             5705.729
Skew:                          1.569   Prob(JB):                         0.00
Kurtosis:                      5.464   Cond. No.                         2.43
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
plt.scatter(df["day_of_week"], df["rentals"], s=2)
plt.title("Rentals vs. day of week")
plt.xlabel("Days of the Week")
plt.ylabel("# Rentals")
est = ols(formula="rentals ~ day_of_week", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     25.13
Date:                Sun, 18 May 2025   Prob (F-statistic):           9.94e-30
Time:                        16:50:17   Log-Likelihood:                -58805.
No. Observations:                8603   AIC:                         1.176e+05
Df Residuals:                    8596   BIC:                         1.177e+05
Df Model:                           6
Covariance Type:            nonrobust
======================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept            225.2480      6.441     34.970      0.000     212.622     237.874
day_of_week[T.Mon]   -22.2210      9.040     -2.458      0.014     -39.942      -4.500
day_of_week[T.Sat]   -57.2245      9.082     -6.301      0.000     -75.027     -39.422
day_of_week[T.Sun]   -71.0166      9.083     -7.818      0.000     -88.822     -53.211
day_of_week[T.Thu]     6.8572      9.119      0.752      0.452     -11.017      24.732
day_of_week[T.Tue]    -6.3961      9.136     -0.700      0.484     -24.304      11.512
day_of_week[T.Wed]    10.2108      9.107      1.121      0.262      -7.642      28.063
==============================================================================
Omnibus:                    2399.606   Durbin-Watson:                   0.335
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             5702.983
Skew:                          1.567   Prob(JB):                         0.00
Kurtosis:                      5.468   Cond. No.                         7.89
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
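Notice how the formula API expanded the categorical day_of_week into indicator ('dummy') variables, with one level (Fri, first alphabetically) absorbed into the intercept as the baseline; each coefficient is that day's average difference from Friday. A sketch of the equivalent expansion by hand:
# One-hot encode day_of_week; drop_first mimics the treatment coding used above
dummies = pd.get_dummies(df["day_of_week"], drop_first=True)
print(dummies.head())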
As a first step toward multiple linear regression, we look for correlations between our independent variables. The reason is simple enough: if two distinct variables are highly correlated, we may not want to include both, because doing so causes a problem called 'collinearity', which we will examine more carefully a bit later.
corr_matrix = df.corr(numeric_only=True)
corr_matrix
 | rentals | temp | temp_wb | rel_humidity | windspeed | precipitation |
---|---|---|---|---|---|---|
rentals | 1.000000 | 0.484716 | 0.425792 | -0.167903 | -0.019345 | -0.071619 |
temp | 0.484716 | 1.000000 | 0.977801 | 0.126377 | -0.148411 | 0.018469 |
temp_wb | 0.425792 | 0.977801 | 1.000000 | 0.319822 | -0.188347 | 0.057638 |
rel_humidity | -0.167903 | 0.126377 | 0.319822 | 1.000000 | -0.218430 | 0.206801 |
windspeed | -0.019345 | -0.148411 | -0.188347 | -0.218430 | 1.000000 | 0.069177 |
precipitation | -0.071619 | 0.018469 | 0.057638 | 0.206801 | 0.069177 | 1.000000 |
sns.heatmap(corr_matrix, annot=True)
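Beyond eyeballing the heatmap, a common numeric check for collinearity is the variance inflation factor (VIF); values much above ~5-10 are usually taken as a warning sign. A minimal sketch using statsmodels, checking just the numeric predictors:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF of each numeric predictor (the 'const' row itself can be ignored)
X = sm.add_constant(df[["temp", "temp_wb", "rel_humidity", "windspeed", "precipitation"]])
for i, col in enumerate(X.columns):
    print(col, variance_inflation_factor(X.values, i))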
est = ols(formula="rentals ~ temp + rel_humidity", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                     1742.
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:51:45   Log-Likelihood:                -57416.
No. Observations:                8603   AIC:                         1.148e+05
Df Residuals:                    8600   BIC:                         1.149e+05
Df Model:                           2
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       52.9562      9.175      5.772      0.000      34.971      70.941
temp             6.3829      0.114     56.066      0.000       6.160       6.606
rel_humidity    -2.7942      0.110    -25.395      0.000      -3.010      -2.579
==============================================================================
Omnibus:                    2380.442   Durbin-Watson:                   0.458
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             6789.829
Skew:                          1.454   Prob(JB):                         0.00
Kurtosis:                      6.237   Cond. No.                         391.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
What is the standard deviation of our residuals? Our best guess is the square root of the average squared residuals.
print(np.sqrt(est.mse_resid))
191.56676007313123
So if we decided to predict the number of rentals when the temp is 60F and the relative humidity is 50%, the point estimate is simply
print(
"y_pred = 52.9562 + 6.3829*60 - 2.7942*50 = "
+ str(52.9562 + 6.3829 * 60 - 2.7942 * 50)
)
y_pred = 52.9562 + 6.3829*60 - 2.7942*50 = 296.2202
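Equivalently, and less error-prone than copying coefficients by hand, we can ask the fitted model directly; with the formula API, est.predict accepts a DataFrame with the named columns:
# Point prediction straight from the fitted model
new_obs = pd.DataFrame({"temp": [60], "rel_humidity": [50]})
print(est.predict(new_obs))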
So a probabilistic estimate is well modeled by a Normal random variable with mean 296.22 and standard deviation 191.63. We can now answer a question like: 'What is the chance that the number of rentals exceeds 400 when the temp is 60F and the relative humidity is 50%?' The answer is the probability that a Normal random variable with mean 296.22 and standard deviation 191.63 exceeds 400!
print(
"Probability of at least 400 rentals at 60F with 50% humidity: "
+ str(1 - norm.cdf(400, loc=296.22, scale=191.63))
)
Probability of at least 400 rentals at 60F with 50% humidity: 0.2940592857195723
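The same Normal model gives rough prediction intervals via norm.ppf; for example, a central 90% interval. (Note the lower endpoint can go negative here, a reminder that the Normal approximation is crude for count data.)
# Central 90% interval under the Normal residual model
lo, hi = norm.ppf([0.05, 0.95], loc=296.22, scale=191.63)
print(lo, hi)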
We can also ask for 95% confidence intervals on the fitted coefficients themselves:
est.conf_int(alpha=0.05)
 | 0 | 1 |
---|---|---|
Intercept | 34.971280 | 70.941053 |
temp | 6.159768 | 6.606104 |
rel_humidity | -3.009922 | -2.578540 |
As a quick sanity check of how norm.cdf works, the probability that a Normal(8, 2) random variable exceeds 10:
1 - norm.cdf(10, loc=8, scale=2)
np.float64(0.15865525393145707)
To see what regression does with a variable that has no real explanatory power, let's add a column of pure noise to the dataframe:
random_data = np.random.rand(len(df["rentals"]))  # uniform noise, one value per row
df["random"] = random_data
df.head(6)
 | rentals | month | day | hour | day_of_week | weekend | temp | temp_wb | rel_humidity | windspeed | precipitation | random |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4 | 1 | 1 | 0 | Mon | 0 | 2 | 0 | 59 | 16 | 0.0 | 0.064791 |
1 | 6 | 1 | 1 | 1 | Mon | 0 | 1 | 0 | 59 | 11 | 0.0 | 0.144366 |
2 | 6 | 1 | 1 | 2 | Mon | 0 | 1 | -1 | 54 | 21 | 0.0 | 0.367953 |
3 | 1 | 1 | 1 | 5 | Mon | 0 | 0 | -2 | 54 | 18 | 0.0 | 0.724437 |
4 | 3 | 1 | 1 | 6 | Mon | 0 | 0 | -2 | 54 | 15 | 0.0 | 0.151816 |
5 | 3 | 1 | 1 | 7 | Mon | 0 | 0 | -2 | 54 | 11 | 0.0 | 0.901812 |
df["rentals"].corr(df["random"])
np.float64(-0.003566719029223307)
plt.scatter(df["random"], df["rentals"], s=2)
plt.title("Rentals vs. Random Data")
plt.xlabel("Random")
plt.ylabel("# Rentals")
est = ols(formula="rentals ~ temp + rel_humidity + random", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                     1161.
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:54:42   Log-Likelihood:                -57416.
No. Observations:                8603   AIC:                         1.148e+05
Df Residuals:                    8599   BIC:                         1.149e+05
Df Model:                           3
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       53.3817      9.867      5.410      0.000      34.040      72.723
temp             6.3828      0.114     56.060      0.000       6.160       6.606
rel_humidity    -2.7943      0.110    -25.393      0.000      -3.010      -2.579
random          -0.8449      7.204     -0.117      0.907     -14.967      13.277
==============================================================================
Omnibus:                    2380.705   Durbin-Watson:                   0.458
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             6791.405
Skew:                          1.454   Prob(JB):                         0.00
Kurtosis:                      6.238   Cond. No.                         445.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
est = ols(formula="rentals ~ temp + temp_wb", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                     1738.
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:54:50   Log-Likelihood:                -57420.
No. Observations:                8603   AIC:                         1.148e+05
Df Residuals:                    8600   BIC:                         1.149e+05
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -119.2405      6.359    -18.752      0.000    -131.705    -106.776
temp          19.3338      0.539     35.858      0.000      18.277      20.391
temp_wb      -14.7260      0.583    -25.258      0.000     -15.869     -13.583
==============================================================================
Omnibus:                    2473.945   Durbin-Watson:                   0.460
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             7319.712
Skew:                          1.497   Prob(JB):                         0.00
Kurtosis:                      6.384   Cond. No.                         233.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Let's also throw windspeed and precipitation into the model:
est = ols(
formula="rentals ~ temp + rel_humidity + windspeed + precipitation", data=df
).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.290
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     876.1
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:54:57   Log-Likelihood:                -57409.
No. Observations:                8603   AIC:                         1.148e+05
Df Residuals:                    8598   BIC:                         1.149e+05
Df Model:                           4
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        40.8156     11.488      3.553      0.000      18.297      63.334
temp              6.3960      0.115     55.782      0.000       6.171       6.621
rel_humidity     -2.6789      0.115    -23.236      0.000      -2.905      -2.453
windspeed         0.4503      0.394      1.144      0.253      -0.321       1.222
precipitation  -237.9935     62.368     -3.816      0.000    -360.251    -115.736
==============================================================================
Omnibus:                    2370.634   Durbin-Watson:                   0.462
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             6743.262
Skew:                          1.449   Prob(JB):                         0.00
Kurtosis:                      6.226   Cond. No.                     2.68e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.68e+03. This might indicate that there are strong multicollinearity or other numerical problems.
est = ols(formula="rentals ~ temp + rel_humidity + precipitation", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.289
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     1168.
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:55:03   Log-Likelihood:                -57410.
No. Observations:                8603   AIC:                         1.148e+05
Df Residuals:                    8599   BIC:                         1.149e+05
Df Model:                           3
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        48.6214      9.242      5.261      0.000      30.504      66.739
temp              6.3796      0.114     56.076      0.000       6.157       6.603
rel_humidity     -2.7084      0.112    -24.104      0.000      -2.929      -2.488
precipitation  -229.4505     61.921     -3.706      0.000    -350.831    -108.070
==============================================================================
Omnibus:                    2367.029   Durbin-Watson:                   0.462
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             6731.705
Skew:                          1.447   Prob(JB):                         0.00
Kurtosis:                      6.225   Cond. No.                     2.64e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.64e+03. This might indicate that there are strong multicollinearity or other numerical problems.
est = ols(formula="rentals ~ temp + temp_wb", data=df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                     1738.
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:55:12   Log-Likelihood:                -57420.
No. Observations:                8603   AIC:                         1.148e+05
Df Residuals:                    8600   BIC:                         1.149e+05
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -119.2405      6.359    -18.752      0.000    -131.705    -106.776
temp          19.3338      0.539     35.858      0.000      18.277      20.391
temp_wb      -14.7260      0.583    -25.258      0.000     -15.869     -13.583
==============================================================================
Omnibus:                    2473.945   Durbin-Watson:                   0.460
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             7319.712
Skew:                          1.497   Prob(JB):                         0.00
Kurtosis:                      6.384   Cond. No.                         233.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(np.sqrt(est.mse_resid))
191.6383021375479
Let's split our data into a training set (70% of the data) and a test set (the remaining 30%). We will use the train_test_split utility we imported to accomplish this.
# Hold out 30% of the rows as a test set (pass random_state=... for a reproducible split)
df_train, df_test = train_test_split(df, test_size=0.3)
Let's retrain a model on just the training data:
est_train = ols(
formula="rentals ~ temp + rel_humidity + precipitation", data=df_train
).fit()
print(est_train.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                rentals   R-squared:                       0.296
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     843.1
Date:                Sun, 18 May 2025   Prob (F-statistic):               0.00
Time:                        16:55:50   Log-Likelihood:                -40218.
No. Observations:                6022   AIC:                         8.044e+04
Df Residuals:                    6018   BIC:                         8.047e+04
Df Model:                           3
Covariance Type:            nonrobust
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        49.3205     11.047      4.464      0.000      27.664      70.977
temp              6.4834      0.137     47.480      0.000       6.216       6.751
rel_humidity     -2.7822      0.134    -20.691      0.000      -3.046      -2.519
precipitation  -286.4779     81.383     -3.520      0.000    -446.017    -126.938
==============================================================================
Omnibus:                    1594.891   Durbin-Watson:                   1.997
Prob(Omnibus):                 0.000   Jarque-Bera (JB):             4308.606
Skew:                          1.411   Prob(JB):                         0.00
Kurtosis:                      6.035   Cond. No.                     2.88e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.88e+03. This might indicate that there are strong multicollinearity or other numerical problems.
How well does it do on the test data? Let's use the model we learned on the training data to predict rentals on the test data and then measure the out-of-sample (OOS) R^2.
print(np.sqrt(est_train.mse_resid))
192.4637563274933
test_pred = est_train.predict(df_test)
print("OOS R-squared: " + str(r2_score(df_test["rentals"], test_pred)))
OOS R-squared: 0.27236571111267827
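As a sanity check, the OOS R^2 is just 1 - SS_res/SS_tot computed on the test set, which we can verify by hand:
# Manual out-of-sample R^2: compare residual error to a mean-only baseline
resid = df_test["rentals"] - test_pred
ss_res = np.sum(resid**2)
ss_tot = np.sum((df_test["rentals"] - df_test["rentals"].mean()) ** 2)
print("OOS R-squared (by hand): " + str(1 - ss_res / ss_tot))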
df_test.head()
 | rentals | month | day | hour | day_of_week | weekend | temp | temp_wb | rel_humidity | windspeed | precipitation | random |
---|---|---|---|---|---|---|---|---|---|---|---|---|
595 | 6 | 1 | 28 | 6 | Sun | 1 | 47 | 46 | 93 | 14 | 0.0 | 0.466239 |
3968 | 105 | 6 | 20 | 23 | Wed | 0 | 70 | 63 | 66 | 7 | 0.0 | 0.520552 |
1774 | 150 | 3 | 21 | 7 | Wed | 0 | 35 | 32 | 72 | 22 | 0.0 | 0.621599 |
741 | 64 | 2 | 3 | 13 | Sat | 1 | 27 | 21 | 37 | 17 | 0.0 | 0.644899 |
8010 | 29 | 12 | 7 | 5 | Fri | 0 | 35 | 32 | 70 | 15 | 0.0 | 0.804495 |
# Manual spot-check of one prediction (these coefficients appear to come from an
# earlier run of the notebook, so they differ slightly from the summary above)
r = 50.7752 + 6.4101 * (46) - 2.7708 * (56) - 246.8192 * (0.0)
r
190.475
test_pred.head()
595      95.296704
3968    319.533277
1774     75.921915
741     121.431156
8010     81.486274
dtype: float64
plt.scatter(df_test["rentals"], test_pred, s=2)
plt.title("Rentals vs. Rentals")
plt.xlabel("Actual Rentals")
plt.ylabel("Predicted Rentals")
Diabetes Dataset
Let's load the publicly available diabetes dataset (bundled with scikit-learn) and print out a description of the dataset.
diabetes = datasets.load_diabetes(as_frame=True)
diabetes_df = diabetes["frame"]
print(diabetes["DESCR"])
.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442
:Number of Attributes: First 10 columns are numeric predictive values
:Target: Column 11 is a quantitative measure of disease progression one year after baseline
:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)
Your task is to build the best linear regression model you can using this data to predict the 'target' field.
diabetes_df.head(6)
 | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
5 | -0.092695 | -0.044642 | -0.040696 | -0.019442 | -0.068991 | -0.079288 | 0.041277 | -0.076395 | -0.041176 | -0.096346 | 97.0 |
skim(diabetes_df)
skimpy summary

Data Summary: 442 rows × 11 columns
Data Types: float64 (11)

number
column | NA | NA % | mean | sd | p0 | p25 | p50 | p75 | p100 | hist
---|---|---|---|---|---|---|---|---|---|---
age | 0 | 0 | -2.512e-19 | 0.04762 | -0.1072 | -0.0373 | 0.005383 | 0.03808 | 0.1107 | ▂▅▆▇▆▁
sex | 0 | 0 | 1.231e-17 | 0.04762 | -0.04464 | -0.04464 | -0.04464 | 0.05068 | 0.05068 | ▇ ▇
bmi | 0 | 0 | -2.246e-16 | 0.04762 | -0.09028 | -0.03423 | -0.007284 | 0.03125 | 0.1706 | ▃▇▅▃▁
bp | 0 | 0 | -4.798e-17 | 0.04762 | -0.1124 | -0.03666 | -0.00567 | 0.03564 | 0.132 | ▁▆▇▅▃▁
s1 | 0 | 0 | -1.381e-17 | 0.04762 | -0.1268 | -0.03425 | -0.004321 | 0.02836 | 0.1539 | ▁▅▇▅▂▁
s2 | 0 | 0 | 3.918e-17 | 0.04762 | -0.1156 | -0.03036 | -0.003819 | 0.02984 | 0.1988 | ▂▇▇▃▁
s3 | 0 | 0 | -5.777e-18 | 0.04762 | -0.1023 | -0.03512 | -0.006584 | 0.02931 | 0.1812 | ▂▇▇▃▁
s4 | 0 | 0 | -9.043e-18 | 0.04762 | -0.07639 | -0.03949 | -0.002592 | 0.03431 | 0.1852 | ▇▆▅▂▁
s5 | 0 | 0 | 9.294e-17 | 0.04762 | -0.1261 | -0.03325 | -0.001947 | 0.03243 | 0.1336 | ▁▅▇▇▃▁
s6 | 0 | 0 | 1.13e-17 | 0.04762 | -0.1378 | -0.03318 | -0.001078 | 0.02792 | 0.1356 | ▁▃▇▇▃▁
target | 0 | 0 | 152.1 | 77.09 | 25 | 87 | 140.5 | 211.5 | 346 | ▇▇▇▅▅▁
diabetes_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB
plt.hist(diabetes_df["target"], bins=range(0, 400, 10))
plt.title("Target data")
plt.xlabel("Target")
plt.ylabel("Number of Data Points")
print(
"Average no. for Target: "
+ str(np.mean(diabetes_df["target"]))
+ " Standard Dev: "
+ str(np.std(diabetes_df["target"]))
)
Average no. for Target: 152.13348416289594 Standard Dev: 77.00574586945044
plt.scatter(diabetes_df["bmi"], diabetes_df["target"], s=2)
plt.title("Target vs. BMI")
plt.xlabel("BMI")
plt.ylabel("Target")
est = ols(formula="target ~ bmi", data=diabetes_df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.344
Model:                            OLS   Adj. R-squared:                  0.342
Method:                 Least Squares   F-statistic:                     230.7
Date:                Sun, 18 May 2025   Prob (F-statistic):           3.47e-42
Time:                        16:58:51   Log-Likelihood:                -2454.0
No. Observations:                 442   AIC:                             4912.
Df Residuals:                     440   BIC:                             4920.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      2.974     51.162      0.000     146.289     157.978
bmi          949.4353     62.515     15.187      0.000     826.570    1072.301
==============================================================================
Omnibus:                      11.674   Durbin-Watson:                   1.848
Prob(Omnibus):                 0.003   Jarque-Bera (JB):                7.310
Skew:                          0.156   Prob(JB):                       0.0259
Kurtosis:                      2.453   Cond. No.                         21.0
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(np.sqrt(est.mse_resid))
62.51512200285265
plt.scatter(diabetes_df["s3"], diabetes_df["target"], s=2)
plt.title("Target vs. s3")
plt.xlabel("s3")
plt.ylabel("Target")
est = ols(formula="target ~ s3", data=diabetes_df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.156
Model:                            OLS   Adj. R-squared:                  0.154
Method:                 Least Squares   F-statistic:                     81.24
Date:                Sun, 18 May 2025   Prob (F-statistic):           6.16e-18
Time:                        16:59:10   Log-Likelihood:                -2509.7
No. Observations:                 442   AIC:                             5023.
Df Residuals:                     440   BIC:                             5032.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      3.373     45.105      0.000     145.504     158.762
s3          -639.1453     70.911     -9.013      0.000    -778.512    -499.778
==============================================================================
Omnibus:                      30.490   Durbin-Watson:                   1.908
Prob(Omnibus):                 0.000   Jarque-Bera (JB):               18.293
Skew:                          0.353   Prob(JB):                     0.000107
Kurtosis:                      2.297   Cond. No.                         21.0
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(np.sqrt(est.mse_resid))
70.91131523302553
# Calculate correlation matrix
correlation_matrix = diabetes_df.corr(numeric_only=True)
correlation_matrix
 | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target |
---|---|---|---|---|---|---|---|---|---|---|---|
age | 1.000000 | 0.173737 | 0.185085 | 0.335428 | 0.260061 | 0.219243 | -0.075181 | 0.203841 | 0.270774 | 0.301731 | 0.187889 |
sex | 0.173737 | 1.000000 | 0.088161 | 0.241010 | 0.035277 | 0.142637 | -0.379090 | 0.332115 | 0.149916 | 0.208133 | 0.043062 |
bmi | 0.185085 | 0.088161 | 1.000000 | 0.395411 | 0.249777 | 0.261170 | -0.366811 | 0.413807 | 0.446157 | 0.388680 | 0.586450 |
bp | 0.335428 | 0.241010 | 0.395411 | 1.000000 | 0.242464 | 0.185548 | -0.178762 | 0.257650 | 0.393480 | 0.390430 | 0.441482 |
s1 | 0.260061 | 0.035277 | 0.249777 | 0.242464 | 1.000000 | 0.896663 | 0.051519 | 0.542207 | 0.515503 | 0.325717 | 0.212022 |
s2 | 0.219243 | 0.142637 | 0.261170 | 0.185548 | 0.896663 | 1.000000 | -0.196455 | 0.659817 | 0.318357 | 0.290600 | 0.174054 |
s3 | -0.075181 | -0.379090 | -0.366811 | -0.178762 | 0.051519 | -0.196455 | 1.000000 | -0.738493 | -0.398577 | -0.273697 | -0.394789 |
s4 | 0.203841 | 0.332115 | 0.413807 | 0.257650 | 0.542207 | 0.659817 | -0.738493 | 1.000000 | 0.617859 | 0.417212 | 0.430453 |
s5 | 0.270774 | 0.149916 | 0.446157 | 0.393480 | 0.515503 | 0.318357 | -0.398577 | 0.617859 | 1.000000 | 0.464669 | 0.565883 |
s6 | 0.301731 | 0.208133 | 0.388680 | 0.390430 | 0.325717 | 0.290600 | -0.273697 | 0.417212 | 0.464669 | 1.000000 | 0.382483 |
target | 0.187889 | 0.043062 | 0.586450 | 0.441482 | 0.212022 | 0.174054 | -0.394789 | 0.430453 | 0.565883 | 0.382483 | 1.000000 |
# Mask correlations whose magnitude is below 0.5 so only the strong ones show
mask = np.abs(correlation_matrix) < 0.5
sns.heatmap(correlation_matrix, annot=True, mask=mask, cmap="coolwarm", center=0)
plt.scatter(diabetes_df["sex"], diabetes_df["target"], s=2)
plt.title("Target vs. Sex [1=male, 0=female]")
plt.xlabel("Sex")
plt.ylabel("Target")
est = ols(formula="target ~ sex", data=diabetes_df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.8174
Date:                Sun, 18 May 2025   Prob (F-statistic):              0.366
Time:                        16:59:44   Log-Likelihood:                -2546.8
No. Observations:                 442   AIC:                             5098.
Df Residuals:                     440   BIC:                             5106.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      3.668     41.479      0.000     144.925     159.342
sex           69.7154     77.109      0.904      0.366     -81.832     221.263
==============================================================================
Omnibus:                      64.308   Durbin-Watson:                   1.908
Prob(Omnibus):                 0.000   Jarque-Bera (JB):               28.530
Skew:                          0.436   Prob(JB):                     6.38e-07
Kurtosis:                      2.112   Cond. No.                         21.0
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(np.sqrt(est.mse_resid))
77.10896796118259
est = ols(formula="target ~ age", data=diabetes_df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     16.10
Date:                Sun, 18 May 2025   Prob (F-statistic):           7.06e-05
Time:                        16:59:59   Log-Likelihood:                -2539.2
No. Observations:                 442   AIC:                             5082.
Df Residuals:                     440   BIC:                             5091.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      3.606     42.192      0.000     145.047     159.220
age          304.1831     75.806      4.013      0.000     155.196     453.170
==============================================================================
Omnibus:                      52.996   Durbin-Watson:                   1.921
Prob(Omnibus):                 0.000   Jarque-Bera (JB):               26.909
Skew:                          0.438   Prob(JB):                     1.43e-06
Kurtosis:                      2.167   Cond. No.                         21.0
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(np.sqrt(est.mse_resid))
75.80599912703144
est = ols(
formula="target ~ age + sex + bmi + bp + s1 + s2 + s3 + s4 + s5", data=diabetes_df
).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.517
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     51.29
Date:                Sun, 18 May 2025   Prob (F-statistic):           8.68e-63
Time:                        17:00:13   Log-Likelihood:                -2386.5
No. Observations:                 442   AIC:                             4793.
Df Residuals:                     432   BIC:                             4834.
Df Model:                           9
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      2.576     59.058      0.000     147.070     157.197
age           -1.9476     59.233     -0.033      0.974    -118.368     114.472
sex         -235.2740     61.065     -3.853      0.000    -355.296    -115.252
bmi          530.1282     65.777      8.060      0.000     400.846     659.410
bp           334.9495     64.609      5.184      0.000     207.963     461.936
s1          -797.2829    416.674     -1.913      0.056   -1616.244      21.678
s2           482.3017    339.007      1.423      0.156    -184.006    1148.610
s3           106.8011    212.470      0.503      0.615    -310.802     524.404
s4           188.7791    161.080      1.172      0.242    -127.819     505.377
s5           767.0074    171.223      4.480      0.000     430.473    1103.541
==============================================================================
Omnibus:                       1.193   Durbin-Watson:                   2.039
Prob(Omnibus):                 0.551   Jarque-Bera (JB):                1.183
Skew:                          0.026   Prob(JB):                        0.554
Kurtosis:                      2.752   Cond. No.                         227.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
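Several terms in this full model have large p-values. One quick way to list them (a sketch; est.pvalues is a pandas Series):
# Terms that are not significant at the 5% level in the full model
print(est.pvalues[est.pvalues > 0.05])
Dropping those terms leads to the leaner model below.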
est = ols(formula="target ~ sex + bmi + bp + s5", data=diabetes_df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.487
Model:                            OLS   Adj. R-squared:                  0.482
Method:                 Least Squares   F-statistic:                     103.6
Date:                Sun, 18 May 2025   Prob (F-statistic):           5.42e-62
Time:                        17:00:23   Log-Likelihood:                -2399.8
No. Observations:                 442   AIC:                             4810.
Df Residuals:                     437   BIC:                             4830.
Df Model:                           4
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      2.639     57.648      0.000     146.947     157.320
sex         -136.7580     57.304     -2.387      0.017    -249.383     -24.132
bmi          598.2839     64.365      9.295      0.000     471.781     724.786
bp           292.9722     63.935      4.582      0.000     167.314     418.630
s5           554.4326     64.427      8.606      0.000     427.807     681.059
==============================================================================
Omnibus:                       5.261   Durbin-Watson:                   1.982
Prob(Omnibus):                 0.072   Jarque-Bera (JB):                4.282
Skew:                          0.145   Prob(JB):                        0.118
Kurtosis:                      2.614   Cond. No.                         28.5
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
est = ols(formula="target ~ bmi + s5", data=diabetes_df).fit()
print(est.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.459
Model:                            OLS   Adj. R-squared:                  0.457
Method:                 Least Squares   F-statistic:                     186.6
Date:                Sun, 18 May 2025   Prob (F-statistic):           2.25e-59
Time:                        17:00:35   Log-Likelihood:                -2411.2
No. Observations:                 442   AIC:                             4828.
Df Residuals:                     439   BIC:                             4841.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      2.702     56.303      0.000     146.823     157.444
bmi          675.0714     63.475     10.635      0.000     550.318     799.825
s5           614.9499     63.475      9.688      0.000     490.197     739.703
==============================================================================
Omnibus:                       7.406   Durbin-Watson:                   1.919
Prob(Omnibus):                 0.025   Jarque-Bera (JB):                5.552
Skew:                          0.160   Prob(JB):                       0.0623
Kurtosis:                      2.554   Cond. No.                         28.2
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(np.sqrt(est.mse_resid))
56.807512054914135
df_train, df_test = train_test_split(diabetes_df, test_size=0.3)
est_train = ols(formula="target ~ sex + bmi + bp + s5", data=df_train).fit()
print(est_train.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                 target   R-squared:                       0.483
Model:                            OLS   Adj. R-squared:                  0.476
Method:                 Least Squares   F-statistic:                     70.98
Date:                Sun, 18 May 2025   Prob (F-statistic):           2.14e-42
Time:                        17:01:06   Log-Likelihood:                -1677.5
No. Observations:                 309   AIC:                             3365.
Df Residuals:                     304   BIC:                             3384.
Df Model:                           4
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.2473      3.169     48.040      0.000     146.011     158.484
sex         -168.8222     68.852     -2.452      0.015    -304.308     -33.336
bmi          581.0621     75.850      7.661      0.000     431.804     730.320
bp           329.1392     77.901      4.225      0.000     175.845     482.433
s5           574.0595     79.412      7.229      0.000     417.793     730.327
==============================================================================
Omnibus:                       3.568   Durbin-Watson:                   2.053
Prob(Omnibus):                 0.168   Jarque-Bera (JB):                2.874
Skew:                          0.120   Prob(JB):                        0.238
Kurtosis:                      2.594   Cond. No.                         28.4
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print(np.sqrt(est_train.mse_resid))
55.58467333439272
test_pred = est_train.predict(df_test)
print("OOS R-squared: " + str(r2_score(df_test["target"], test_pred)))
OOS R-squared: 0.4884507706169847
df_test.head()
 | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target |
---|---|---|---|---|---|---|---|---|---|---|---|
189 | -0.001882 | -0.044642 | -0.066563 | 0.001215 | -0.002945 | 0.003070 | 0.011824 | -0.002592 | -0.020292 | -0.025930 | 79.0 |
37 | -0.009147 | -0.044642 | 0.011039 | -0.057313 | -0.024960 | -0.042963 | 0.030232 | -0.039493 | 0.017036 | -0.005220 | 276.0 |
4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
225 | 0.030811 | 0.050680 | 0.032595 | 0.049415 | -0.040096 | -0.043589 | -0.069172 | 0.034309 | 0.063015 | 0.003064 | 208.0 |
77 | -0.096328 | -0.044642 | -0.036385 | -0.074527 | -0.038720 | -0.027618 | 0.015505 | -0.039493 | -0.074093 | -0.001078 | 200.0 |
test_pred.head()
189    109.857331
37     157.113898
4      127.478307
225    215.070221
77      71.578589
dtype: float64
x = df_test["target"]
y = test_pred
plt.scatter(x, y, s=2)
plt.title("Predicted Target vs. Actual Target")
plt.xlabel("Actual Target")
plt.ylabel("Predicted Target")
# Calculate linear regression line
slope, intercept = np.polyfit(x, y, 1)
y_pred = slope * x + intercept # y-values of the regression line
# Plot the regression line
plt.plot(x, y_pred, color="red", label=f"Line: y = {intercept:.2f} + {slope:.2f}x")
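For a quick calibration check, it also helps to overlay the 45-degree line, on which perfect predictions would fall; a minimal sketch:
# Reference line y = x: points above it are over-predictions, below it under-predictions
lims = [float(x.min()), float(x.max())]
plt.plot(lims, lims, linestyle="--", color="gray", label="y = x")
plt.legend()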