Prerequisite: Before running this notebook for the first time, data_extraction_&_data_wrangling.ipynb should be executed once to generate the cleaned dataset (cleaned_data.csv) in the project folder.
This notebook includes the following processes:
A. Exploratory Data Analysis
B. Feature Scaling
C. Building Regression Model (Baseline Model)
D. Model Evaluation
E. Model Diagnostics (Residual Analysis)
F. Model Refinement
G. Conclusion
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
A. Exploratory Data Analysis¶
# Load the cleaned dataset
df = pd.read_csv('cleaned_data.csv')
df.head()
county | poverty_rate | health_insurance | median_household_income | unemployment_rate | public_transit | median_house_value | median_gross_rent | bachelor_holders | public_assistance | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2.0 | 0.0 | 307000 | 1286 | 20.64 | 0.58 |
1 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 5.4 | 0.0 | 160900 | 782 | 11.84 | 1.65 |
2 | Cullman County, Alabama | 18.3 | 89.4 | 58923 | 3.8 | 0.3 | 204000 | 788 | 10.75 | 1.34 |
3 | DeKalb County, Alabama | 24.7 | 86.1 | 43509 | 3.8 | 0.0 | 156800 | 659 | 9.33 | 2.66 |
4 | Elmore County, Alabama | 12.9 | 92.4 | 72478 | 2.7 | 0.0 | 229900 | 933 | 17.73 | 0.17 |
1. Sample size¶
# Show the number of rows (counties)
df.shape[0]
854
# Retrieve state names
states = df['county'].apply(lambda x: x.split(','))
# Show the number of states included in the data
states.apply(lambda x: x[1]).nunique()
52
# Show how many counties in each state are included in the dataset
states.apply(lambda x: x[1]).value_counts()
county
Texas                   56
North Carolina          44
California              42
Florida                 41
Pennsylvania            40
Georgia                 39
New York                38
Ohio                    38
Virginia                30
Michigan                30
Indiana                 27
Wisconsin               25
Illinois                22
South Carolina          22
New Jersey              21
Tennessee               21
Washington              20
Alabama                 20
Louisiana               17
Missouri                17
Maryland                16
Minnesota               16
Oregon                  15
Kentucky                14
Massachusetts           12
Colorado                12
Mississippi             11
Oklahoma                11
Arkansas                11
Puerto Rico             11
Arizona                 10
Iowa                    10
New Mexico               9
Connecticut              9
Kansas                   8
New Hampshire            7
West Virginia            7
Utah                     7
Montana                  6
Maine                    6
Idaho                    6
Rhode Island             4
North Dakota             4
Hawaii                   4
South Dakota             3
Delaware                 3
Alaska                   3
Nebraska                 3
Wyoming                  2
Nevada                   2
District of Columbia     1
Vermont                  1
Name: count, dtype: int64
Key findings:
- There are 854 counties. This is a good sample size for multiple linear regression.
- The number of states is 52. This is because District of Columbia and Puerto Rico are included in the dataset. I may consider excluding Puerto Rico as my focus is only on U.S. states and DC, especially since socioeconomic dynamics there can be quite different.
- The distribution of counties per state is uneven. This is likely due to sampling design or data availability rather than an equal sampling of all states.
2. Data distribution¶
df.describe()
poverty_rate | health_insurance | median_household_income | unemployment_rate | public_transit | median_house_value | median_gross_rent | bachelor_holders | public_assistance | |
---|---|---|---|---|---|---|---|---|---|
count | 854.000000 | 854.000000 | 854.000000 | 854.000000 | 854.000000 | 8.540000e+02 | 854.000000 | 854.000000 | 854.000000 |
mean | 12.770843 | 92.588056 | 76512.345433 | 4.208431 | 1.528220 | 3.309796e+05 | 1258.251756 | 19.458888 | 2.139461 |
std | 5.401620 | 3.849054 | 20022.532998 | 1.727863 | 4.359279 | 1.743939e+05 | 372.467925 | 5.772340 | 1.256567 |
min | 2.000000 | 70.700000 | 16836.000000 | 0.500000 | 0.000000 | 5.270000e+04 | 447.000000 | 6.370000 | 0.000000 |
25% | 9.100000 | 90.400000 | 62032.250000 | 3.100000 | 0.100000 | 2.173750e+05 | 983.000000 | 15.102500 | 1.280000 |
50% | 12.000000 | 93.400000 | 72994.000000 | 4.000000 | 0.500000 | 2.909500e+05 | 1170.000000 | 19.145000 | 1.860000 |
75% | 15.675000 | 95.400000 | 87054.000000 | 5.000000 | 1.300000 | 3.917750e+05 | 1478.750000 | 22.907500 | 2.730000 |
max | 53.900000 | 98.500000 | 174148.000000 | 19.800000 | 52.500000 | 1.512200e+06 | 2797.000000 | 39.470000 | 10.550000 |
Key findings:
- For most features, the median (50th percentile) and the mean are close to each other (within one standard deviation of each other). This suggests that the distributions of these features are fairly concentrated.
- Percentage columns have low standard deviations, meaning their values are not widely spread out.
- poverty_rate, unemployment_rate, public_transit, and public_assistance: The max values are very far from the mean, while the min values are much closer to it (based on the number of standard deviations from the mean). This is an indicator that these features may have extreme outliers.
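To make this concrete, the below quick check (a minimal sketch using the already-loaded df) computes how many standard deviations the max and min of each of these columns lie from its mean:
# Distance of the max and min from the mean, in standard deviations
cols = ['poverty_rate', 'unemployment_rate', 'public_transit', 'public_assistance']
desc = df[cols].describe()
sd_above = (desc.loc['max'] - desc.loc['mean']) / desc.loc['std']  # max: SDs above the mean
sd_below = (desc.loc['mean'] - desc.loc['min']) / desc.loc['std']  # min: SDs below the mean
print(pd.DataFrame({'max_sd_above_mean': sd_above, 'min_sd_below_mean': sd_below}).round(1))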
Inspecting the distribution of poverty_rate:
fig, axes = plt.subplots(1, 2, figsize=(12, 4)) # 1 row and 2 columns
# Create histogram
sns.histplot(df['poverty_rate'], ax=axes[0])
# Create boxplot
sns.boxplot(x=df['poverty_rate'], ax=axes[1])
fig.suptitle('Distribution of Poverty Rate')
plt.show()
# Check for rows containing outliers in poverty rate
Q1 = df['poverty_rate'].quantile(0.25)
Q3 = df['poverty_rate'].quantile(0.75)
IQR = Q3 - Q1
df[df['poverty_rate'] >= (Q3 + 1.5*IQR)]
county | poverty_rate | health_insurance | median_household_income | unemployment_rate | public_transit | median_house_value | median_gross_rent | bachelor_holders | public_assistance | |
---|---|---|---|---|---|---|---|---|---|---|
23 | Apache County, Arizona | 29.8 | 86.0 | 40539 | 11.2 | 0.3 | 52700 | 607 | 8.53 | 2.97 |
28 | Navajo County, Arizona | 25.6 | 87.1 | 50754 | 8.1 | 0.6 | 241400 | 869 | 10.78 | 3.45 |
155 | Bulloch County, Georgia | 25.6 | 89.9 | 54007 | 7.0 | 1.3 | 240200 | 927 | 16.14 | 1.06 |
166 | Dougherty County, Georgia | 26.1 | 88.2 | 45769 | 11.3 | 0.7 | 144500 | 856 | 7.67 | 1.31 |
386 | Lauderdale County, Mississippi | 25.6 | 90.2 | 50542 | 7.6 | 0.0 | 145600 | 840 | 11.31 | 0.82 |
449 | McKinley County, New Mexico | 38.2 | 81.1 | 40108 | 10.9 | 0.2 | 71000 | 768 | 6.37 | 1.61 |
456 | Bronx County, New York | 27.9 | 93.3 | 46838 | 9.3 | 52.5 | 498200 | 1352 | 14.49 | 10.55 |
526 | Robeson County, North Carolina | 28.1 | 87.1 | 40996 | 4.7 | 0.2 | 87100 | 786 | 8.84 | 0.36 |
720 | Hidalgo County, Texas | 27.2 | 74.7 | 53661 | 5.6 | 0.2 | 151200 | 936 | 12.82 | 1.94 |
740 | Starr County, Texas | 29.0 | 74.5 | 41566 | 7.0 | 0.0 | 89600 | 747 | 8.33 | 2.07 |
771 | Montgomery County, Virginia | 26.0 | 96.0 | 68079 | 4.2 | 2.6 | 303500 | 1241 | 16.82 | 1.37 |
843 | Arecibo Municipio, Puerto Rico | 34.9 | 96.0 | 25496 | 12.1 | 0.0 | 114600 | 536 | 24.10 | 2.73 |
844 | Bayamón Municipio, Puerto Rico | 32.7 | 95.1 | 30660 | 6.4 | 1.8 | 144900 | 662 | 25.50 | 1.87 |
845 | Caguas Municipio, Puerto Rico | 35.4 | 93.6 | 30589 | 8.5 | 0.5 | 152900 | 590 | 22.58 | 4.38 |
848 | Mayagüez Municipio, Puerto Rico | 53.9 | 96.1 | 16836 | 19.8 | 2.5 | 107000 | 447 | 15.54 | 4.07 |
849 | Ponce Municipio, Puerto Rico | 50.4 | 96.4 | 18889 | 17.2 | 0.3 | 100600 | 526 | 20.16 | 2.45 |
850 | San Juan Municipio, Puerto Rico | 38.5 | 90.4 | 27403 | 9.1 | 4.7 | 175300 | 607 | 24.36 | 2.06 |
851 | Toa Alta Municipio, Puerto Rico | 36.1 | 91.7 | 31635 | 5.0 | 0.0 | 176800 | 566 | 22.46 | 1.02 |
852 | Toa Baja Municipio, Puerto Rico | 29.8 | 93.9 | 31814 | 5.2 | 0.4 | 148100 | 772 | 20.36 | 2.41 |
Key findings: Most of the outliers in poverty rate are counties in Puerto Rico, a U.S. territory whose socioeconomic and demographic characteristics are significantly different from those of U.S. states. Including Puerto Rico's data with the states might distort the regression results, because Puerto Rico's extreme values could skew the model.
Solution:
- Remove Puerto Rico from the analysis.
- Other outliers in the response variable and in explanatory variables are likely to be legitimate extreme values. Since the data comes from a reputable source like the U.S. Census Bureau, it's likely to be cleaned and processed carefully already. The Census Bureau employs robust methods for estimating and adjusting the data, and any extreme values or outliers might reflect genuine socio-economic conditions in specific counties rather than errors.
# Remove counties in Puerto Rico
df = df[~df['county'].str.contains('Puerto Rico')]
df.shape
(843, 10)
# Observe the distribution of poverty rate again
sns.histplot(df['poverty_rate'])
plt.title('Distribution of Poverty Rate')
plt.show()
3. Correlation Between Variables¶
# Create a dataframe with independent and dependent variables only
df_num = df.drop('county', axis=1)
3.1 Multicollinearity¶
corr_matrix = df_num.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()
Key findings:
- A 0.86 correlation coefficient between median_gross_rent and median_house_value suggests that there is a strong positive correlation between these two variables.
- public_transit has a very weak positive correlation (0.053) with poverty_rate, indicating little relationship between poverty rate and the percentage of people commuting to work by public transit.
- The coefficient between public_assistance and poverty_rate is 0.25, which indicates a relatively weak positive relationship.
Sometimes, a variable with a weak correlation may still be statistically significant in the model because of its unique contribution in combination with other variables. Removing variables prematurely may lead to underfitting.
The below code to calculate VIFs was adapted from Vikash Singh on DataCamp: LINK
# Identify independent variables
predictors = df_num.drop(columns='poverty_rate')
# Add constant term for intercept
predictors = sm.add_constant(predictors)
# Calculate VIF
vif_data = pd.DataFrame()
vif_data['variable'] = predictors.columns
vif_data['VIF'] = [variance_inflation_factor(predictors.values, i) for i in range(predictors.shape[1])]
print(vif_data)
                   variable         VIF
0                     const  794.168036
1          health_insurance    1.461573
2  median_household_income     4.460533
3         unemployment_rate    1.414113
4            public_transit    1.344033
5        median_house_value    4.575705
6         median_gross_rent    5.933254
7          bachelor_holders    2.708681
8         public_assistance    1.369089
Key findings:
- VIF for median_gross_rent is a bit high (5.93). It suggests that median_gross_rent is highly correlated with one or more other variables (likely with median_house_value, given the previous correlation of 0.86).
- VIF for median_house_value is also relatively high (4.58), but it's below the commonly used threshold of 5.
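As a sanity check, a VIF is just 1 / (1 − R²) from regressing one predictor on all the others. The below minimal sketch (reusing the predictors frame built above) reproduces the VIF for median_gross_rent:
# Reproduce VIF for median_gross_rent from its definition: 1 / (1 - R^2)
aux_y = predictors['median_gross_rent']
aux_X = predictors.drop(columns='median_gross_rent')  # other predictors, constant included
aux_r2 = sm.OLS(aux_y, aux_X).fit().rsquared
print(1 / (1 - aux_r2))  # should match the 5.93 reported above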
Solution: To avoid multicollinearity, I will remove median_gross_rent.
df_num.drop(columns='median_gross_rent', inplace=True)
df_num.head(1)
poverty_rate | health_insurance | median_household_income | unemployment_rate | public_transit | median_house_value | bachelor_holders | public_assistance | |
---|---|---|---|---|---|---|---|---|
0 | 9.7 | 93.2 | 72915 | 2.0 | 0.0 | 307000 | 20.64 | 0.58 |
3.2 Linearity¶
# Visualizing the relationships between features using pairplot
sns.pairplot(df_num)
# List of the predictor columns
predictors = df_num.drop(columns =['poverty_rate'])
# Create a figure with a 2x4 grid of subplots
fig, axes = plt.subplots(2, 4, figsize=(18, 8))
axes = axes.flatten() # Flatten the axes array for easier indexing
# Loop through each predictor and plot the scatter plot with regression line
for i, predictor in enumerate(predictors):
sns.regplot(x=df[predictor], y=df['poverty_rate'], ax=axes[i], line_kws={'color': 'red'})
axes[i].set_xlabel(predictor)
axes[i].set_ylabel('Poverty Rate')
axes[i].set_title(f'Poverty Rate vs {predictor}')
plt.tight_layout()
plt.show()
Key findings:
- median_household_income: The regression line looks curved, meaning the relationship is non-linear.
- public_transit: Data is heavily concentrated near zero, and the regression line may not fit well overall (possible non-linearity).
- median_house_value: There's a slight curve, which indicates non-linearity.
Inspecting median_household_income:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Histogram
sns.histplot(df_num['median_household_income'], bins=50, ax=axes[0])
# Scatter plot
sns.regplot(x='median_household_income', y='poverty_rate', data=df_num, line_kws={"color": "red"}, ax=axes[1])
plt.show()
Key findings: The distribution of median_household_income is right-skewed, and its correlation with the response variable is strong but non-linear.
Solution: A log transformation can help reduce skewness and make the relationship more linear.
# Apply log transformation
df_num['log_median_income'] = np.log(df_num['median_household_income'])
df_num.drop(columns='median_household_income', inplace=True)
# Check the plots again after log transformation
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Histogram
sns.histplot(df_num['log_median_income'], bins=50, ax=axes[0])
# Scatter plot
sns.regplot(x='log_median_income', y='poverty_rate', data=df_num, line_kws={'color': 'red'}, ax=axes[1])
plt.show()
Inspecting public_transit:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Histogram
sns.histplot(df_num['public_transit'], bins=50, ax=axes[0])
# Scatter plot
sns.regplot(x='public_transit', y='poverty_rate', data=df_num, line_kws={'color': 'red'}, ax=axes[1])
plt.show()
Key findings: The distribution of public_transit is highly skewed, and its correlation with the response variable is very weak.
Solution: Log transformation can help in modeling. I will add 1 to avoid issues with zero values (log(public_transit + 1)).
# Apply log transformation
df_num['log_public_transit'] = np.log1p(df_num['public_transit'])
df_num.drop(columns='public_transit', inplace=True)
# Check the plots again after log transformation
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Histogram
sns.histplot(df_num['log_public_transit'], bins=50, ax=axes[0])
# Scatter plot
sns.regplot(x='log_public_transit', y='poverty_rate', data=df_num, line_kws={'color': 'red'}, ax=axes[1])
plt.show()
Inspecting median_house_value:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Histogram
sns.histplot(df_num['median_house_value'], bins=50, ax=axes[0])
# Scatter plot
sns.regplot(x='median_house_value', y='poverty_rate', data=df_num, line_kws={'color': 'red'}, ax=axes[1])
plt.show()
Key findings: The distribution of median_house_value is positively skewed, and its relationship with the response variable is slightly curved.
Solution: A square root transformation can help make the relationship more linear, while distorting the data less than a log transformation would.
# Apply square root transformation
df_num['sqrt_median_house_value'] = np.sqrt(df_num['median_house_value'])
df_num.drop(columns='median_house_value', inplace=True)
# Check the plots again after square root transformation
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Histogram
sns.histplot(df_num['sqrt_median_house_value'], bins=50, ax=axes[0])
# Scatter plot
sns.regplot(x='sqrt_median_house_value', y='poverty_rate', data=df_num, line_kws={'color': 'red'}, ax=axes[1])
plt.show()
# Save dataset with transformed predictors
df_num.to_csv('transformed_data.csv', index=False)
# Check the saved CSV file
df = pd.read_csv('transformed_data.csv')
df.head()
poverty_rate | health_insurance | unemployment_rate | bachelor_holders | public_assistance | log_median_income | log_public_transit | sqrt_median_house_value | |
---|---|---|---|---|---|---|---|---|
0 | 9.7 | 93.2 | 2.0 | 20.64 | 0.58 | 11.197050 | 0.000000 | 554.075807 |
1 | 21.3 | 91.1 | 5.4 | 11.84 | 1.65 | 10.835258 | 0.000000 | 401.123422 |
2 | 18.3 | 89.4 | 3.8 | 10.75 | 1.34 | 10.983987 | 0.262364 | 451.663592 |
3 | 24.7 | 86.1 | 3.8 | 9.33 | 2.66 | 10.680723 | 0.000000 | 395.979797 |
4 | 12.9 | 92.4 | 2.7 | 17.73 | 0.17 | 11.191038 | 0.000000 | 479.478884 |
B. Feature Scaling¶
MinMaxScaler is applied to put the predictors on the same scale so that the regression coefficients are comparable in magnitude.
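For reference, MinMaxScaler rescales each feature to the [0, 1] range via x′ = (x − x_min) / (x_max − x_min), so every predictor shares the same bounds.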
predictors = df_num.drop(columns='poverty_rate')
# Apply scaling
scaler = MinMaxScaler()
scaled_predictors = scaler.fit_transform(predictors)
scaled_predictors = pd.DataFrame(scaled_predictors, columns=predictors.columns)
scaled_predictors.head()
health_insurance | unemployment_rate | bachelor_holders | public_assistance | log_median_income | log_public_transit | sqrt_median_house_value | |
---|---|---|---|---|---|---|---|
0 | 0.809353 | 0.131579 | 0.431118 | 0.054976 | 0.407074 | 0.000000 | 0.324462 |
1 | 0.733813 | 0.429825 | 0.165257 | 0.156398 | 0.160677 | 0.000000 | 0.171533 |
2 | 0.672662 | 0.289474 | 0.132326 | 0.127014 | 0.261968 | 0.065926 | 0.222065 |
3 | 0.553957 | 0.289474 | 0.089426 | 0.252133 | 0.055432 | 0.000000 | 0.166390 |
4 | 0.780576 | 0.192982 | 0.343202 | 0.016114 | 0.402980 | 0.000000 | 0.249876 |
C. Building Regression Model (Baseline Model)¶
An OLS model is fitted on transformed and scaled predictors with train/test split.
# Add intercept to scaled predictors and specify the dependent variable
X = sm.add_constant(scaled_predictors)
y = df_num['poverty_rate']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the model
ols = sm.OLS(y_train, X_train)
model = ols.fit()
D. Model Evaluation¶
# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate R-squared and MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'R-squared: {r2}')
print(f'Mean Squared Error: {mse}')
R-squared: 0.7960750041154723
Mean Squared Error: 4.355126198792516
Key findings:
- The model explains 79.61% of the variance in poverty rates on unseen data.
- On average, the squared difference between predicted and actual poverty rates is 4.36.
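For a more interpretable error figure, the square root of the MSE (RMSE) expresses the typical prediction error in the original units, i.e., percentage points of poverty rate:
# RMSE: typical prediction error in percentage points
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse:.2f}')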
# View model summary
print(model.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           poverty_rate   R-squared:                       0.763
Model:                            OLS   Adj. R-squared:                  0.761
Method:                 Least Squares   F-statistic:                     307.1
Date:                Wed, 04 Jun 2025   Prob (F-statistic):          1.04e-203
Time:                        00:29:00   Log-Likelihood:                -1513.3
No. Observations:                 674   AIC:                             3043.
Df Residuals:                     666   BIC:                             3079.
Df Model:                           7
Covariance Type:            nonrobust
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                      20.8661      0.641     32.554      0.000      19.608      22.125
health_insurance           -2.7029      0.710     -3.807      0.000      -4.097      -1.309
unemployment_rate           6.7305      0.769      8.753      0.000       5.221       8.240
bachelor_holders            1.2994      0.854      1.521      0.129      -0.378       2.977
public_assistance           0.1061      0.838      0.127      0.899      -1.540       1.752
log_median_income         -26.6265      1.060    -25.125      0.000     -28.707     -24.546
log_public_transit          5.6042      0.698      8.027      0.000       4.233       6.975
sqrt_median_house_value     4.7093      1.205      3.908      0.000       2.343       7.075
==============================================================================
Omnibus:                       53.971   Durbin-Watson:                   1.804
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              121.840
Skew:                           0.456   Prob(JB):                     3.49e-27
Kurtosis:                       4.873   Cond. No.                         25.0
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Key findings:
- Training set R-squared: The model explains 76.3% of the variance in poverty rates.
- Prob (F-statistic): The p-value for the global hypothesis test is very small, so we can reject the null hypothesis; at least one predictor has a significant effect on poverty rate.
- Insignificant predictors: bachelor_holders (p-value: 0.129) and public_assistance (p-value: 0.899).
- log_median_income has the largest absolute standardized coefficient, which makes it the most influential predictor in this model.
- One unexpected finding is that sqrt_median_house_value has a positive coefficient, meaning higher house values are associated with higher poverty rates. However, the scatter plot of poverty_rate vs. sqrt_median_house_value suggests otherwise.
It is important to note that this baseline model may suffer from assumption violations (e.g., heteroscedasticity, skewness), so coefficient estimates and p-values may be less reliable. I will need to conduct model diagnostics and possibly model refinement before drawing a final answer to the research question.
with open("outputs/base_model_summary.html", "w") as f:
f.write(model.summary().as_html())
E. Model Diagnostics (Residual Analysis)¶
1. Constant Variance (Homoscedasticity)¶
# Residuals vs fitted values plot
residuals = y_test - y_pred
plt.scatter(y_pred, residuals, alpha=0.7)
plt.axhline(0, color='red')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted Values')
plt.savefig("outputs/base_residuals_fitted.png", dpi=300)
plt.show()
Key findings:
- Residuals are scattered fairly symmetrically around 0: the model doesn't have major bias.
- Mild funnel or cone shape (the residuals seem to spread out more as fitted values increase): possible heteroscedasticity; a formal check follows below.
- A few large residuals: these could be outliers.
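The funnel shape can be checked formally with a Breusch-Pagan test. The below minimal sketch runs it on the training residuals (a small p-value indicates non-constant variance):
from statsmodels.stats.diagnostic import het_breuschpagan
# Breusch-Pagan: tests whether squared residuals depend on the predictors
lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(model.resid, X_train)
print(f'Breusch-Pagan LM p-value: {lm_pvalue:.4f}')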
2. Linearity¶
fig, axes = plt.subplots(2, 4, figsize=(18,8))
axes = axes.flatten()
predictors = X_test.drop(columns = 'const')
for i, col in enumerate(predictors.columns):
axes[i].scatter(predictors[col], residuals, alpha=0.7)
axes[i].axhline(0, color='r')
axes[i].set_xlabel(col)
axes[i].set_ylabel('Residuals')
fig.savefig("outputs/base_residuals_predictors.png", dpi=300)
plt.show()
Key findings:
- For most plots, the points are fairly symmetrically scattered around 0, except for log_median_income.
- log_median_income: A slight curved (U-shape) pattern suggests a nonlinear relationship.
- bachelor_holders: This residual plot suggests that linearity is not the issue; the lack of significance (high p-value) is more likely due to low correlation with the outcome after adjusting for other variables.
3. Normality¶
fig, axes = plt.subplots(1, 2, figsize=(12,4))
# Histogram of residuals
sns.histplot(residuals, bins=20, edgecolor='black', ax=axes[0])
axes[0].set_title('Histogram of Residuals')
axes[0].set_xlabel('Residuals')
axes[0].set_ylabel('Frequency')
# Boxplot of residuals
sns.boxplot(residuals, ax=axes[1])
axes[1].set_title('Boxplot of Residuals')
axes[1].set_xlabel('Residuals')
fig.savefig("outputs/base_residuals_normality.png", dpi=300)
plt.show()
Key findings:
- The residuals are centered around 0
- The distribution of residuals is slightly skewed left.
- Boxplot shows a few outliers at higher end.
# Q-Q plot to check for normality of residuals
stats.probplot(residuals, dist='norm', plot=plt)
plt.savefig("outputs/base_qq.png", dpi=300)
plt.show()
Key findings:
- The residuals in the center are fairly normal.
- The deviation from the line in the tails confirms what we saw in the histogram and boxplot: there are outliers on the right (larger positive residuals). A formal check follows below.
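A formal test can complement the visual checks. The below minimal sketch applies the Shapiro-Wilk test to the test-set residuals (a small p-value indicates departure from normality):
# Shapiro-Wilk test for normality of residuals
stat, p_value = stats.shapiro(residuals)
print(f'Shapiro-Wilk p-value: {p_value:.4f}')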
F. Model Refinement¶
I refined the model step by step to see how each change affected performance and to make sure I only included adjustments that clearly improved model performance or fixed specific issues.
Experiments with various model refinement methods were conducted in a separate notebook to keep this main analysis clean and focused.
Please refer to model_refinement_experiments.ipynb for detailed exploratory model testing and rationale behind the final model choices presented here.
1. Transforming Dependent Variable¶
Applying a square root transformation to the response variable helps stabilize variance and reduce skewness.
# Apply transformation
df_num['sqrt_poverty_rate'] = np.sqrt(df_num['poverty_rate'])
fig, axes = plt.subplots(1, 2, figsize=(12,4))
# Compare the distribution of poverty rate before and after transformation
sns.histplot(df_num['poverty_rate'], ax=axes[0])
sns.histplot(df_num['sqrt_poverty_rate'], ax=axes[1])
plt.show()
Key findings: The response variable is more symmetrically distributed after applying the square root transformation.
2. Interaction Term¶
Adding interaction terms allows the model to capture more complex, real-world relationships between predictors.
An interaction term can be identified by asking: Do any predictors influence each other’s effects on the dependent variable? Socioeconomic variables may not act independently in influencing poverty. In the baseline model, I'm assuming that household income has the same effect everywhere, regardless of house values. But what if counties with medium to high income have housing affordability issues? The impact of household income on poverty may differ based on local housing cost.
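Concretely, adding the interaction term changes the model from poverty ≈ β₀ + β₁·log_median_income + β₂·sqrt_median_house_value + … to poverty ≈ β₀ + β₁·log_median_income + β₂·sqrt_median_house_value + β₃·(log_median_income × sqrt_median_house_value) + …, so the marginal effect of income becomes β₁ + β₃·sqrt_median_house_value and is allowed to vary with local house values.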
The below code for generating a scatter plot with customized hue levels is adapted from JohanC on Stack Overflow: LINK
# Bin sqrt_median_house_value into categories
bins = [0, 600, 1300]
labels = ['Low', 'High']
house_value_bin = pd.cut(df_num['sqrt_median_house_value'], bins=bins, labels=labels)
sns.scatterplot(x='log_median_income', y='poverty_rate', hue=house_value_bin, data=df_num, alpha=0.5)
plt.savefig("outputs/interaction_term.png", dpi=300)
plt.show()
Key findings:
- There's a clear negative relationship between log_median_income and poverty_rate.
- Higher housing cost is also associated with higher income.
- At the same income level, counties with higher sqrt_median_house_value (orange points) seem to have higher poverty rates than counties with lower house values (blue points).
- For counties with lower house values, the drop in poverty rate with increasing income appears steeper than for counties with higher house values. This means that the effect of income on poverty varies depending on house values.
df_num['income_x_house_value'] = (df_num['log_median_income'] * df_num['sqrt_median_house_value'])
3. Fitting a New Model with Robust Standard Errors¶
Using robust standard errors (HC3) corrects for heteroscedasticity and ensures valid inference, especially for the t-tests and confidence intervals on coefficients.
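For reference, HC3 estimates the coefficient covariance as (XᵀX)⁻¹ Xᵀ diag(eᵢ² / (1 − hᵢᵢ)²) X (XᵀX)⁻¹, where eᵢ is the i-th residual and hᵢᵢ its leverage; dividing by (1 − hᵢᵢ)² upweights high-leverage observations, which makes HC3 comparatively conservative.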
3.1 Scaling:¶
predictors = df_num.drop(columns=['sqrt_poverty_rate', 'poverty_rate'])
# Apply scaling
scaler = MinMaxScaler()
scaled_predictors = scaler.fit_transform(predictors)
scaled_predictors = pd.DataFrame(scaled_predictors, columns=predictors.columns)
scaled_predictors.head()
health_insurance | unemployment_rate | bachelor_holders | public_assistance | log_median_income | log_public_transit | sqrt_median_house_value | income_x_house_value | |
---|---|---|---|---|---|---|---|---|
0 | 0.809353 | 0.131579 | 0.431118 | 0.054976 | 0.407074 | 0.000000 | 0.324462 | 0.310475 |
1 | 0.733813 | 0.429825 | 0.165257 | 0.156398 | 0.160677 | 0.000000 | 0.171533 | 0.157415 |
2 | 0.672662 | 0.289474 | 0.132326 | 0.127014 | 0.261968 | 0.065926 | 0.222065 | 0.208068 |
3 | 0.553957 | 0.289474 | 0.089426 | 0.252133 | 0.055432 | 0.000000 | 0.166390 | 0.147781 |
4 | 0.780576 | 0.192982 | 0.343202 | 0.016114 | 0.402980 | 0.000000 | 0.249876 | 0.241420 |
3.2 New model:¶
# Add intercept to scaled explanatory data and specify the dependent variable
X = sm.add_constant(scaled_predictors)
y = df_num['sqrt_poverty_rate']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the model with HC3 robust standard errors
ols = sm.OLS(y_train, X_train)
refined_model = ols.fit(cov_type='HC3')
4. New model evaluation¶
# Make predictions on the test set
y_pred = refined_model.predict(X_test)
# Calculate R-squared and MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'R-squared: {r2}')
print(f'Mean Squared Error: {mse}')
R-squared: 0.8310581036269753
Mean Squared Error: 0.07192225315999502
Key findings:
- The model explains 83.11% of the variance in the (square-root-transformed) poverty rates on unseen data, higher than the baseline model's R-squared on the test set.
- MSE is 0.072, much lower than the baseline model's 4.36. Note, however, that this MSE is measured on the square-root scale of the response, so it is not directly comparable to the baseline; see the back-transformed comparison below.
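For an apples-to-apples comparison with the baseline, the below minimal sketch squares the predictions and actuals to return them to the original poverty-rate scale before computing the error (note that simply squaring fitted values introduces a small retransformation bias):
# Back-transform to the original scale and recompute the test error
mse_original_scale = mean_squared_error(y_test**2, y_pred**2)
print(f'MSE on the original scale: {mse_original_scale:.2f}')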
# View model summary
print(refined_model.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:      sqrt_poverty_rate   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.793
Method:                 Least Squares   F-statistic:                     312.2
Date:                Wed, 04 Jun 2025   Prob (F-statistic):          2.10e-219
Time:                        00:29:01   Log-Likelihood:                -139.94
No. Observations:                 674   AIC:                             297.9
Df Residuals:                     665   BIC:                             338.5
Df Model:                           8
Covariance Type:                  HC3
===========================================================================================
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       5.0828      0.125     40.570      0.000       4.837       5.328
health_insurance           -0.3674      0.107     -3.445      0.001      -0.576      -0.158
unemployment_rate           0.8111      0.109      7.411      0.000       0.597       1.026
bachelor_holders            0.1332      0.116      1.149      0.251      -0.094       0.360
public_assistance           0.0735      0.120      0.613      0.540      -0.162       0.309
log_median_income          -5.2078      0.252    -20.651      0.000      -5.702      -4.713
log_public_transit          0.7545      0.095      7.940      0.000       0.568       0.941
sqrt_median_house_value   -18.3796      3.244     -5.666      0.000     -24.738     -12.021
income_x_house_value       20.4523      3.399      6.018      0.000      13.791      27.113
==============================================================================
Omnibus:                       16.933   Durbin-Watson:                   1.854
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               34.163
Skew:                          -0.042   Prob(JB):                     3.82e-08
Kurtosis:                       4.100   Cond. No.                         677.
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity robust (HC3)
Key findings:
- The refined model explains more variance (79.6%) in the transformed dependent variable, and the adjusted R-squared shows that even after penalizing for complexity, the new model still fits the data better.
- Prob (F-statistic): The p-value is extremely small, so we can reject the null hypothesis.
- bachelor_holders and public_assistance (with high p-values) are still the two insignificant predictors in the model.
- sqrt_median_house_value, log_median_income, unemployment_rate, log_public_transit, and health_insurance remain high-impact predictors.
- sqrt_median_house_value is the most impactful individual predictor, as its coefficient is the largest in absolute value among the individual predictors (excluding the interaction term).
- income_x_house_value is significant (p-value ≈ 0). This suggests the effect of income on poverty varies by home value, and vice versa.
- A positive coefficient for the interaction term suggests that as income increases, the negative effect of house value on poverty becomes weaker, and as house value increases, the negative effect of income on poverty weakens.
- The sign of the sqrt_median_house_value coefficient flips in the refined model due to the interaction term. The negative coefficient means that higher housing values are associated with lower poverty rates, which aligns better with general economic assumptions.
Once I transformed the response variable and introduced the interaction term, I got a clearer, more interpretable model.
with open("outputs/final_model_summary.html", "w") as f:
f.write(refined_model.summary().as_html())
Let's visualize the relative importance of each predictor.
The following code for generating a tornado diagram is adapted from Brydon: LINK
# Visualize standardized coefficients
coeff = refined_model.params
coeff = coeff.iloc[(coeff.abs()*-1.0).argsort()]
sns.barplot(x=coeff.values, y=coeff.index, orient='h', hue=coeff.index)
plt.title('Tornado Diagram: Standardized Coefficients')
plt.xlabel('Standardized Coefficient')
plt.ylabel('Predictors')
plt.savefig("outputs/tornado_diagram.png", dpi=300, bbox_inches='tight')
plt.show()
Key findings:
- The interaction term (income × house value) dominates in explanatory power.
- Among the individual predictors, sqrt_median_house_value and log_median_income are the two most impactful, and they both have negative effects on poverty rates.
5. Residual Analysis¶
# Residuals vs fitted values plot
residuals = y_test - y_pred
plt.scatter(y_pred, residuals, alpha=0.7)
plt.axhline(0, color='red')
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted Values')
plt.savefig("outputs/final_residuals_fitted.png", dpi=300)
plt.show()
Key findings:
- The points are scattered fairly randomly around the red zero line.
- Residual spread appears relatively consistent across fitted values (no funnel shape), indicating homoscedasticity; a formal re-check follows below.
- There seem to be a few outliers.
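The earlier Breusch-Pagan sketch can be repeated on the refined model's training residuals to back up this reading:
from statsmodels.stats.diagnostic import het_breuschpagan
lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(refined_model.resid, X_train)
print(f'Breusch-Pagan LM p-value: {lm_pvalue:.4f}')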
fig, axes = plt.subplots(2, 4, figsize=(18,8))
axes = axes.flatten()
predictors = X_test.drop(columns = 'const')
for i, col in enumerate(predictors.columns):
axes[i].scatter(predictors[col], residuals, alpha=0.7)
axes[i].axhline(0, color='r')
axes[i].set_xlabel(col)
axes[i].set_ylabel('Residuals')
plt.tight_layout()
fig.savefig("outputs/final_residuals_predictors.png", dpi=300)
plt.show()
Key findings: There is no major pattern and no obvious heteroscedasticity across the plots.
fig, axes = plt.subplots(1, 2, figsize=(12,4))
# Histogram of residuals
sns.histplot(residuals, bins=20, edgecolor='black', ax=axes[0])
axes[0].set_title('Histogram of Residuals')
axes[0].set_xlabel('Residuals')
axes[0].set_ylabel('Frequency')
# Boxplot of residuals
sns.boxplot(residuals, ax=axes[1])
axes[1].set_title('Boxplot of Residuals')
axes[1].set_xlabel('Residuals')
fig.savefig("outputs/final_residuals_normality.png", dpi=300)
plt.show()
Key findings: Normality of residuals is reasonably satisfied.
- The distribution is roughly bell-shaped and fairly symmetrical.
- The median is very close to zero, and there are only a few mild outliers at the upper tail.
# Q-Q plot to check for normality of residuals
stats.probplot(residuals, dist='norm', plot=plt)
plt.savefig("outputs/final_qq.png", dpi=300)
plt.show()
Key findings: The residuals still appear to be approximately normally distributed.
- Most of the blue points lie very close to the red reference line.
- A slight deviation is observed at the upper-right tail, which is acceptable.
G. Conclusion¶
- The refined multiple linear regression model, which includes a transformed dependent variable, transformed and scaled predictors, and an interaction term, explains approximately 79.6% of the variation in county-level poverty rates.
- The refined model not only fits the training data well but also performs strongly on the test set with test R-squared of 0.831 and MSE of 0.072.
- The final model better fits the data, accounts for heteroscedasticity, and addresses potential violations of OLS assumptions. This gives me more reliable p-values and coefficient estimates.
- The model's F-statistic is highly significant (p < 0.05). We can reject the global null hypothesis that all slope coefficients are zero. This confirms that at least some of the socioeconomic factors have a statistically significant impact on poverty rates.
- The most impactful predictors of poverty rates are median_house_value and median_income, as they both show strong negative relationships: counties with higher home values and incomes tend to have lower poverty rates.
- Other significant factors (with p < 0.05) include:
  - Higher unemployment is associated with higher poverty.
  - Greater health insurance coverage comes with lower poverty.
  - More public transit use relates to higher poverty (possibly reflecting urban conditions).
- Education level (bachelor_holders) and public assistance were not statistically significant in the refined model, which means these factors have limited unique contribution when controlling for other factors.
Note: The OPM (Official Poverty Measure) does not include non-cash public assistance (like SNAP or housing subsidies) in its calculation. So counties with high assistance rates may not show reduced poverty under the OPM even though aid might be helping in reality.