This notebook covers the process of acquiring and cleaning data.
import requests
import json
import pandas as pd
import numpy as np
A. Data Extraction¶
The data were collected by making API requests to the U.S. Census Bureau’s ACS 2023 1-Year datasets. Information about ACS table types and variables: LINK
To construct API request URLs, we first need to identify variable codes and geographic code (ucgid) for county-level data.
- Identifying variables:
Variables are gathered from 3 different table types: Subject Tables, Detailed Tables, and Data Profiles.
A variable code starts with a table ID whose leading letter(s) indicate the table type: "S" for Subject Tables, "B" for Detailed Tables, "DP" for Data Profiles. For example: S1701_C03_001E, DP03_0009PE, etc.
Relevant variables were selected by browsing tables and variable labels on https://data.census.gov. Each selected variable label was then mapped to its corresponding variable code using the following variable metadata documentation:
A more reliable way to retrieve variable codes is to use retrieving_variable_codes.ipynb.
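That programmatic approach can be sketched as follows: fetch the dataset's variables.json metadata and search the labels for a keyword. To keep the snippet runnable offline, the metadata mapping is replaced here by a tiny hard-coded sample with the same shape as the Census API's response; find_codes is an illustrative helper, not part of this notebook.

```python
# Sample of the mapping returned by an ACS variables.json metadata endpoint
# (structure assumed from the Census API; only two entries shown for illustration).
variables = {
    'S1701_C03_001E': {'label': 'Estimate!!Percent below poverty level!!Population for whom poverty status is determined'},
    'S2701_C03_001E': {'label': 'Estimate!!Percent Insured!!Civilian noninstitutionalized population'},
}

def find_codes(variables, keyword):
    """Return variable codes whose label contains the keyword (case-insensitive)."""
    keyword = keyword.lower()
    return {code: info['label'] for code, info in variables.items()
            if keyword in info.get('label', '').lower()}

print(find_codes(variables, 'poverty'))
```

In practice the `variables` mapping would come from a request such as `requests.get('https://api.census.gov/data/2023/acs/acs1/subject/variables.json').json()['variables']`.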
Description of selected variables and corresponding variable codes:
Column | Description | Original table | Variable code |
---|---|---|---|
county | County name and state | All tables | county |
poverty_rate | Poverty rate in percentage | S1701 | S1701_C03_001E |
health_insurance | The percentage of people with health insurance | S2701 | S2701_C03_001E |
median_household_income | Median household income in dollars | S1901 | S1901_C01_012E |
bachelor_18-24 | The number of people aged 18–24 with a bachelor’s degree | S1501 | S1501_C01_005E |
bachelor_25+ | The number of people aged 25 and over with a bachelor’s degree | S1501 | S1501_C01_012E |
population_18-24 | The number of people aged 18–24 | S1501 | S1501_C01_001E |
population_25+ | The number of people aged 25 and over | S1501 | S1501_C01_006E |
unemployment_rate | The percentage of the civilian labor force that is unemployed | DP03 | DP03_0009PE |
public_transit | The percentage of workers who commute to work by public transit | DP03 | DP03_0021PE |
with_public_assistance | The number of households receiving public assistance income | B19057 | B19057_002E |
total_households | The total number of households | B19057 | B19057_001E |
median_house_value | Median house value in dollars | B25077 | B25077_001E |
median_gross_rent | Median gross rent in dollars | B25064 | B25064_001E |
- Identifying geographic code:
- Documentation for ucgid: LINK
- The ucgid that returns data for all counties within the U.S. is:
0100000US$0500000
- 0100000US: Fully qualified GEOID of the United States (parent geography)
- $: A separator that links parent and child geographic levels
- 0500000: The summary level code for counties
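The county GEOIDs returned in the ucgid column (e.g. 0500000US01003) follow the same pattern: summary level, "US", then the 2-digit state FIPS code and 3-digit county FIPS code. A small helper of my own (not part of the notebook) to decompose one, assuming that standard layout:

```python
def split_geoid(geoid):
    """Split a county GEOID like '0500000US01003' into its parts.

    Assumed layout: summary level prefix + 'US' + 2-digit state FIPS
    + 3-digit county FIPS.
    """
    prefix, fips = geoid.split('US')
    return {
        'summary_level': prefix[:3],   # '050' = county
        'state_fips': fips[:2],        # e.g. '01' = Alabama
        'county_fips': fips[2:],       # e.g. '003'
    }

print(split_geoid('0500000US01003'))
```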
# Retrieve data from Subject tables
subject_col_ids = ",".join(['NAME','S1701_C03_001E','S2701_C03_001E','S1901_C01_012E','S1501_C01_005E','S1501_C01_012E','S1501_C01_001E','S1501_C01_006E'])
url = 'https://api.census.gov/data/2023/acs/acs1/subject'
response = requests.get(f'{url}?get={subject_col_ids}&ucgid=pseudo(0100000US$0500000)&descriptive=true')
content = response.json()
# Store the raw data in local data store
with open('data1.json', 'w') as file:
    json.dump(content, file)
df1 = pd.read_json('data1.json')
df1.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
0 | NAME | S1701_C03_001E | S2701_C03_001E | S1901_C01_012E | S1501_C01_005E | S1501_C01_012E | S1501_C01_001E | S1501_C01_006E | ucgid |
1 | Geographic Area Name | Estimate!!Percent below poverty level!!Populat... | Estimate!!Percent Insured!!Civilian noninstitu... | Estimate!!Households!!Median income (dollars) | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Geography |
2 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2213 | 39152 | 15552 | 184852 | 0500000US01003 |
3 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 605 | 10275 | 13499 | 78420 | 0500000US01015 |
4 | Cullman County, Alabama | 18.3 | 89.4 | 58923 | 661 | 7026 | 7223 | 64307 | 0500000US01043 |
# Retrieve data from Data Profiles tables
dp_col_ids = ",".join(['NAME','DP03_0009PE','DP03_0021PE'])
url = 'https://api.census.gov/data/2023/acs/acs1/profile'
response = requests.get(f'{url}?get={dp_col_ids}&ucgid=pseudo(0100000US$0500000)&descriptive=true')
content = response.json()
# Store the raw data in local data store
with open('data2.json', 'w') as file:
    json.dump(content, file)
df2 = pd.read_json('data2.json')
df2.head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | NAME | DP03_0009PE | DP03_0021PE | ucgid |
1 | Geographic Area Name | Percent!!EMPLOYMENT STATUS!!Civilian labor for... | Percent!!COMMUTING TO WORK!!Workers 16 years a... | Geography |
2 | DeKalb County, Alabama | 3.8 | 0.0 | 0500000US01049 |
3 | Baldwin County, Alabama | 2.0 | 0.0 | 0500000US01003 |
4 | Calhoun County, Alabama | 5.4 | 0.0 | 0500000US01015 |
# Retrieve data from Detailed Tables
db_col_ids = ",".join(['NAME','B19057_002E','B19057_001E','B25077_001E','B25064_001E'])
url = 'https://api.census.gov/data/2023/acs/acs1'
response = requests.get(f'{url}?get={db_col_ids}&ucgid=pseudo(0100000US$0500000)&descriptive=true')
content = response.json()
# Store the raw data in local data store
with open('data3.json', 'w') as file:
    json.dump(content, file)
df3 = pd.read_json('data3.json')
df3.head()
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | NAME | B19057_002E | B19057_001E | B25077_001E | B25064_001E | ucgid |
1 | Geographic Area Name | Estimate!!Total:!!With public assistance income | Estimate!!Total: | Estimate!!Median value (dollars) | Estimate!!Median gross rent | Geography |
2 | Baldwin County, Alabama | 618 | 105698 | 307000 | 1286 | 0500000US01003 |
3 | Calhoun County, Alabama | 766 | 46310 | 160900 | 782 | 0500000US01015 |
4 | Cullman County, Alabama | 483 | 35961 | 204000 | 788 | 0500000US01043 |
print(df1.shape)
print(df2.shape)
print(df3.shape)
(856, 9) (856, 4) (856, 6)
# Drop the duplicate ucgid columns before merging (df3's ucgid is kept)
df1.drop(columns=[8], inplace=True)
df2.drop(columns=[3], inplace=True)
# Merge the three frames on column 0, which holds the county name
merged_df = df1.merge(df2, on=0).merge(df3, on=0)
merged_df.head()
0 | 1_x | 2_x | 3_x | 4_x | 5_x | 6 | 7 | 1_y | 2_y | 1 | 2 | 3_y | 4_y | 5_y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NAME | S1701_C03_001E | S2701_C03_001E | S1901_C01_012E | S1501_C01_005E | S1501_C01_012E | S1501_C01_001E | S1501_C01_006E | DP03_0009PE | DP03_0021PE | B19057_002E | B19057_001E | B25077_001E | B25064_001E | ucgid |
1 | Geographic Area Name | Estimate!!Percent below poverty level!!Populat... | Estimate!!Percent Insured!!Civilian noninstitu... | Estimate!!Households!!Median income (dollars) | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Percent!!EMPLOYMENT STATUS!!Civilian labor for... | Percent!!COMMUTING TO WORK!!Workers 16 years a... | Estimate!!Total:!!With public assistance income | Estimate!!Total: | Estimate!!Median value (dollars) | Estimate!!Median gross rent | Geography |
2 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2213 | 39152 | 15552 | 184852 | 2.0 | 0.0 | 618 | 105698 | 307000 | 1286 | 0500000US01003 |
3 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 605 | 10275 | 13499 | 78420 | 5.4 | 0.0 | 766 | 46310 | 160900 | 782 | 0500000US01015 |
4 | Cullman County, Alabama | 18.3 | 89.4 | 58923 | 661 | 7026 | 7223 | 64307 | 3.8 | 0.3 | 483 | 35961 | 204000 | 788 | 0500000US01043 |
merged_df.shape
(856, 15)
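The merge above silently assumes each county name appears exactly once in every frame; pandas can enforce that assumption with merge's validate argument, which raises a MergeError if the key is duplicated. A minimal sketch on toy frames (the data here are illustrative):

```python
import pandas as pd

left = pd.DataFrame({0: ['A', 'B'], 1: [1, 2]})
right = pd.DataFrame({0: ['A', 'B'], 2: [3, 4]})

# validate='one_to_one' raises pandas.errors.MergeError
# if the merge key is duplicated on either side
merged = left.merge(right, on=0, validate='one_to_one')

assert len(merged) == len(left)  # no rows lost or duplicated
```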
merged_df.to_csv('raw_data.csv', index=False)
B. Data Wrangling¶
cleaned_df = merged_df.copy()
1. Tidiness Issue: Column headers are codes¶
cleaned_df.columns = cleaned_df.loc[0]
cleaned_df.drop(0, inplace=True)
cleaned_df.reset_index(drop=True, inplace=True)
cleaned_df.head(1)
NAME | S1701_C03_001E | S2701_C03_001E | S1901_C01_012E | S1501_C01_005E | S1501_C01_012E | S1501_C01_001E | S1501_C01_006E | DP03_0009PE | DP03_0021PE | B19057_002E | B19057_001E | B25077_001E | B25064_001E | ucgid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Geographic Area Name | Estimate!!Percent below poverty level!!Populat... | Estimate!!Percent Insured!!Civilian noninstitu... | Estimate!!Households!!Median income (dollars) | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT... | Percent!!EMPLOYMENT STATUS!!Civilian labor for... | Percent!!COMMUTING TO WORK!!Workers 16 years a... | Estimate!!Total:!!With public assistance income | Estimate!!Total: | Estimate!!Median value (dollars) | Estimate!!Median gross rent | Geography |
Issue: Column headers are alphanumeric codes that are hard to understand.
Solution: Convert the column headers into short, descriptive names.
The first row contains column labels describing each column. We can inspect these labels to understand what each code means.
# Check for the column labels
for i in cleaned_df.columns:
    label = cleaned_df.loc[0, i]
    print(f'{i}: {label}')
NAME: Geographic Area Name S1701_C03_001E: Estimate!!Percent below poverty level!!Population for whom poverty status is determined S2701_C03_001E: Estimate!!Percent Insured!!Civilian noninstitutionalized population S1901_C01_012E: Estimate!!Households!!Median income (dollars) S1501_C01_005E: Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!Bachelor's degree or higher S1501_C01_012E: Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree S1501_C01_001E: Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years S1501_C01_006E: Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over DP03_0009PE: Percent!!EMPLOYMENT STATUS!!Civilian labor force!!Unemployment Rate DP03_0021PE: Percent!!COMMUTING TO WORK!!Workers 16 years and over!!Public transportation (excluding taxicab) B19057_002E: Estimate!!Total:!!With public assistance income B19057_001E: Estimate!!Total: B25077_001E: Estimate!!Median value (dollars) B25064_001E: Estimate!!Median gross rent ucgid: Geography
# Create column names
col_names = ['county','poverty_rate','health_insurance','median_household_income','bachelor_18-24','bachelor_25+','population_18-24','population_25+',
'unemployment_rate','public_transit','with_public_assistance','total_households','median_house_value','median_gross_rent','ucgid']
# Apply column names
cleaned_df.columns = col_names
# Drop the column label row
cleaned_df.drop(0, inplace=True)
cleaned_df.reset_index(drop=True, inplace=True)
cleaned_df.head(2)
county | poverty_rate | health_insurance | median_household_income | bachelor_18-24 | bachelor_25+ | population_18-24 | population_25+ | unemployment_rate | public_transit | with_public_assistance | total_households | median_house_value | median_gross_rent | ucgid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2213 | 39152 | 15552 | 184852 | 2.0 | 0.0 | 618 | 105698 | 307000 | 1286 | 0500000US01003 |
1 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 605 | 10275 | 13499 | 78420 | 5.4 | 0.0 | 766 | 46310 | 160900 | 782 | 0500000US01015 |
2. Quality Issue: Incorrect data types¶
cleaned_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 854 entries, 0 to 853 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 county 854 non-null object 1 poverty_rate 854 non-null object 2 health_insurance 854 non-null object 3 median_household_income 854 non-null object 4 bachelor_18-24 854 non-null object 5 bachelor_25+ 854 non-null object 6 population_18-24 854 non-null object 7 population_25+ 854 non-null object 8 unemployment_rate 854 non-null object 9 public_transit 854 non-null object 10 with_public_assistance 854 non-null object 11 total_households 854 non-null object 12 median_house_value 854 non-null object 13 median_gross_rent 854 non-null object 14 ucgid 854 non-null object dtypes: object(15) memory usage: 100.2+ KB
Issue: Some columns should be numeric (integer or float) instead of object.
Solution: Convert those columns to the appropriate numeric types.
# Identify integer columns
int_cols = ['median_household_income','bachelor_18-24','bachelor_25+','population_18-24','population_25+',
'with_public_assistance','total_households','median_house_value','median_gross_rent']
# Identify float columns
float_cols = ['poverty_rate','health_insurance','unemployment_rate','public_transit']
# Convert from object to numeric. pd.to_numeric infers int64 or float64;
# errors='raise' raises a ValueError if a column contains non-numeric characters.
for col in int_cols + float_cols:
    cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='raise')
cleaned_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 854 entries, 0 to 853 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 county 854 non-null object 1 poverty_rate 854 non-null float64 2 health_insurance 854 non-null float64 3 median_household_income 854 non-null int64 4 bachelor_18-24 854 non-null int64 5 bachelor_25+ 854 non-null int64 6 population_18-24 854 non-null int64 7 population_25+ 854 non-null int64 8 unemployment_rate 854 non-null float64 9 public_transit 854 non-null float64 10 with_public_assistance 854 non-null int64 11 total_households 854 non-null int64 12 median_house_value 854 non-null int64 13 median_gross_rent 854 non-null int64 14 ucgid 854 non-null object dtypes: float64(4), int64(9), object(2) memory usage: 100.2+ KB
3. Quality Issue: Completeness¶
When converting a column from object to a numeric type, pandas raises a ValueError if the column contains strings it cannot parse as numbers. The code above executed without error, meaning no string annotation is being used to mark missing values.
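The difference between the errors modes can be demonstrated on a toy Series (the values here are illustrative; 'N' stands in for a string annotation):

```python
import pandas as pd

s = pd.Series(['9.7', '21.3', 'N'])

# errors='coerce' silently turns unparseable values into NaN
print(pd.to_numeric(s, errors='coerce'))

# errors='raise' (the default) surfaces the problem instead
try:
    pd.to_numeric(s, errors='raise')
except ValueError as e:
    print('ValueError:', e)
```

Using errors='raise' here is a deliberate choice: a silent NaN could hide an annotation value.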
cleaned_df.isna().sum()
county 0 poverty_rate 0 health_insurance 0 median_household_income 0 bachelor_18-24 0 bachelor_25+ 0 population_18-24 0 population_25+ 0 unemployment_rate 0 public_transit 0 with_public_assistance 0 total_households 0 median_house_value 0 median_gross_rent 0 ucgid 0 dtype: int64
There seems to be no null value in the dataset. However, a 0 value might indicate a missing value, so let's check whether any 0s appear in the dataset.
# Identify integer columns with at least one 0
for col in int_cols:
    if (cleaned_df[col] == 0).any():
        print(col)
bachelor_18-24 with_public_assistance
# Identify float columns with at least one 0
for col in float_cols:
    if (cleaned_df[col] == 0).any():
        print(col)
public_transit
bachelor_18-24 and with_public_assistance need further inspection to determine whether their 0 values are actual observations or missing values.
It's normal for public_transit to have 0 values, as this variable represents the percentage of people commuting to work by public transit, and many U.S. counties have no public transit system.
Inspecting bachelor_18-24:
This column represents the number of bachelor's degree holders aged 18-24.
# Identify rows where 0 occurs in 'bachelor_18-24'
cleaned_df[cleaned_df['bachelor_18-24'] == 0]
county | poverty_rate | health_insurance | median_household_income | bachelor_18-24 | bachelor_25+ | population_18-24 | population_25+ | unemployment_rate | public_transit | with_public_assistance | total_households | median_house_value | median_gross_rent | ucgid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
24 | Cochise County, Arizona | 15.9 | 90.2 | 52025 | 0 | 13568 | 9920 | 89455 | 6.3 | 0.0 | 1512 | 50919 | 235200 | 988 | 0500000US04003 |
38 | Lonoke County, Arkansas | 8.4 | 95.1 | 75327 | 0 | 8512 | 5355 | 52158 | 5.0 | 0.0 | 435 | 29388 | 189300 | 1065 | 0500000US05085 |
119 | Columbia County, Florida | 16.8 | 83.2 | 60812 | 0 | 5873 | 4627 | 53112 | 7.8 | -999999999.0 | 511 | 27280 | 230300 | 998 | 0500000US12023 |
186 | Spalding County, Georgia | 13.9 | 85.2 | 56169 | 0 | 6052 | 6546 | 47107 | 3.1 | 0.0 | 885 | 26898 | 240400 | 1073 | 0500000US13255 |
200 | Twin Falls County, Idaho | 9.3 | 92.5 | 66643 | 0 | 8520 | 8634 | 62141 | 2.6 | 0.0 | 1157 | 35050 | 329000 | 1142 | 0500000US16083 |
277 | McCracken County, Kentucky | 15.6 | 95.4 | 60720 | 0 | 7386 | 4572 | 48018 | 1.3 | -999999999.0 | 206 | 28934 | 179600 | 764 | 0500000US21145 |
The 0 value occurs 6 times in bachelor_18-24. These occurrences need further inspection to determine whether they are missing values or actual observations.
We can cross-check these 0s against 2022 data from the U.S. Census Bureau, which covers the same six counties.
# Retrieve data in 2022
ucgid = ','.join(cleaned_df['ucgid'][cleaned_df['bachelor_18-24'] == 0])
url = f'https://api.census.gov/data/2022/acs/acs1/subject?get=NAME,S1501_C01_001E,S1501_C01_005E&ucgid={ucgid}'
data = requests.get(url).json()
crosscheck = pd.DataFrame(data,columns=['county','population_18-24','bachelor_18-24','ucgid'])
crosscheck
county | population_18-24 | bachelor_18-24 | ucgid | |
---|---|---|---|---|
0 | NAME | S1501_C01_001E | S1501_C01_005E | ucgid |
1 | Cochise County, Arizona | 10485 | 849 | 0500000US04003 |
2 | Lonoke County, Arkansas | 6009 | 90 | 0500000US05085 |
3 | Columbia County, Florida | 5922 | 116 | 0500000US12023 |
4 | Spalding County, Georgia | 6533 | 27 | 0500000US13255 |
5 | Twin Falls County, Idaho | 7574 | 607 | 0500000US16083 |
6 | McCracken County, Kentucky | 5663 | 512 | 0500000US21145 |
The "crosscheck" dataset includes population_18-24
and bachelor_18-24
across six selected counties in 2022. According to the "crosscheck" dataframe, in 2022, these 6 counties had individuals aged 18-24 with a bachelor's degree. It's very unlikely that these counties had no bachelor's degree holder just in 2023.
Issue: These 0 values are missing values, not actual observations.
Solution: Impute the missing values using the 2022 data. Each estimate is the county's 2023 population aged 18-24 multiplied by its 2022 ratio of bachelor's degree holders aged 18-24 to the total population aged 18-24.
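For one county the arithmetic works out as follows; the numbers below are taken from the Cochise County rows of the two tables above:

```python
# 2022 values for Cochise County, Arizona (from the crosscheck table)
bachelor_22, population_22 = 849, 10485
# 2023 population aged 18-24 for the same county (from cleaned_df)
population_23 = 9920

ratio_22 = bachelor_22 / population_22      # 2022 share of 18-24 with a bachelor's
impute_val = int(population_23 * ratio_22)  # apply that share to the 2023 population
print(impute_val)  # → 803
```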
crosscheck.drop(0, inplace=True) # Remove the first row in "crosscheck"
# Convert datatype in "crosscheck" before calculation
crosscheck['bachelor_18-24'] = pd.to_numeric(crosscheck['bachelor_18-24'])
crosscheck['population_18-24'] = pd.to_numeric(crosscheck['population_18-24'])
county_list = []
for i in range(1, 7):
    # 2022 share of 18-24-year-olds holding a bachelor's degree
    ratio_22 = crosscheck.loc[i, 'bachelor_18-24'] / crosscheck.loc[i, 'population_18-24']
    county = crosscheck.loc[i, 'county']
    # Apply that share to the county's 2023 population aged 18-24
    population_23 = cleaned_df['population_18-24'][cleaned_df['county'] == county]
    impute_val = (population_23 * ratio_22).astype(int)
    cleaned_df.loc[cleaned_df['county'] == county, 'bachelor_18-24'] = impute_val
    county_list.append(county)
cleaned_df[['county','bachelor_18-24']][cleaned_df['county'].isin(county_list)]
county | bachelor_18-24 | |
---|---|---|
24 | Cochise County, Arizona | 803 |
38 | Lonoke County, Arkansas | 80 |
119 | Columbia County, Florida | 90 |
186 | Spalding County, Georgia | 27 |
200 | Twin Falls County, Idaho | 691 |
277 | McCracken County, Kentucky | 413 |
Inspecting with_public_assistance:
This column represents the number of households receiving public assistance income.
# Identify rows where 0 occurs in 'with_public_assistance'
cleaned_df[cleaned_df['with_public_assistance'] == 0]
county | poverty_rate | health_insurance | median_household_income | bachelor_18-24 | bachelor_25+ | population_18-24 | population_25+ | unemployment_rate | public_transit | with_public_assistance | total_households | median_house_value | median_gross_rent | ucgid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
225 | Boone County, Indiana | 2.7 | 96.0 | 103592 | 680 | 14078 | 5001 | 51586 | 0.8 | 0.0 | 0 | 29659 | 369300 | 1303 | 0500000US18011 |
656 | Georgetown County, South Carolina | 11.9 | 89.7 | 71313 | 340 | 9992 | 5044 | 49831 | 3.2 | 0.0 | 0 | 28720 | 322200 | 1517 | 0500000US45043 |
As with the bachelor_18-24 column, I will cross-check the two 0s against 2022 data.
# Retrieve data in 2022
url = 'https://api.census.gov/data/2022/acs/acs1?get=NAME,B19057_002E,B19057_001E&ucgid=0500000US18011,0500000US45043'
data = requests.get(url).json()
crosscheck = pd.DataFrame(data,columns=['county','with_public_assistance','total_households','ucgid'])
crosscheck.drop(0, inplace=True) # Remove the first row which contains column IDs
# Convert datatype in "crosscheck"
crosscheck['with_public_assistance'] = crosscheck['with_public_assistance'].astype('int')
crosscheck['total_households'] = crosscheck['total_households'].astype('int')
crosscheck
county | with_public_assistance | total_households | ucgid | |
---|---|---|---|---|
1 | Boone County, Indiana | 257 | 28756 | 0500000US18011 |
In 2022, 257 households in Boone County, Indiana received public assistance income, while the 2022 dataset has no record for Georgetown County, South Carolina.
Issue: Boone County's 0 is a missing value, not an actual observation.
Solution: The 2023 with_public_assistance value for Boone County will be imputed the same way I did earlier with bachelor_18-24, while Georgetown County's 0 will be kept as is.
# Impute 0 value for Boone County
ratio_22 = crosscheck.loc[1,'with_public_assistance']/crosscheck.loc[1,'total_households']
total_households_23 = cleaned_df['total_households'][cleaned_df['ucgid'] == '0500000US18011']
impute_val = (total_households_23 * ratio_22).astype(int)
cleaned_df.loc[cleaned_df['ucgid'] == '0500000US18011', 'with_public_assistance'] = impute_val
# Check the imputed value
cleaned_df[['county','with_public_assistance']][cleaned_df['ucgid'] == '0500000US18011']
county | with_public_assistance | |
---|---|---|
225 | Boone County, Indiana | 265 |
4. Quality Issue: Inconsistency¶
Annotation values:
Annotation values mark non-standard data. They can be non-numeric characters or special numbers. Since I already ruled out the possibility of non-numeric characters, I will check for special numbers.
A list of annotation values can be found here: Link
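That check can be generalized to scan every numeric column, not just public_transit. A sketch of the idea (the sentinel list follows the annotation values used below; the toy frame is illustrative):

```python
import pandas as pd

ANNOTATIONS = [-666666666, -999999999, -888888888,
               -222222222, -333333333, -555555555]

df = pd.DataFrame({
    'poverty_rate': [9.7, 21.3],
    'public_transit': [0.0, -999999999.0],
})

# Report every numeric column that contains an annotation sentinel
for col in df.select_dtypes('number').columns:
    n = df[col].isin(ANNOTATIONS).sum()
    if n:
        print(f'{col}: {n} annotation value(s)')
```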
cleaned_df[['county','public_transit','ucgid']][cleaned_df['public_transit'].isin([-666666666,-999999999,-888888888,-222222222,-333333333,-555555555])]
county | public_transit | ucgid | |
---|---|---|---|
37 | Jefferson County, Arkansas | -999999999.0 | 0500000US05069 |
119 | Columbia County, Florida | -999999999.0 | 0500000US12023 |
168 | Effingham County, Georgia | -999999999.0 | 0500000US13103 |
187 | Troup County, Georgia | -999999999.0 | 0500000US13285 |
188 | Walker County, Georgia | -999999999.0 | 0500000US13295 |
277 | McCracken County, Kentucky | -999999999.0 | 0500000US21145 |
562 | Marion County, Ohio | -999999999.0 | 0500000US39101 |
733 | Orange County, Texas | -999999999.0 | 0500000US48361 |
843 | Arecibo Municipio, Puerto Rico | -999999999.0 | 0500000US72013 |
851 | Toa Alta Municipio, Puerto Rico | -999999999.0 | 0500000US72135 |
Issue: The occurrence of "-999999999.0" means that there's a missing value due to insufficient number of sample cases in the selected geographic area.
Solution: I will apply the same cross-check-and-impute method using 2022 data.
# Retrieve data in 2022
ucgid = ','.join(cleaned_df['ucgid'][cleaned_df['public_transit'] == -999999999])
url = f'https://api.census.gov/data/2022/acs/acs1/profile?get=NAME,DP03_0021PE&ucgid={ucgid}'
data = requests.get(url).json()
crosscheck = pd.DataFrame(data,columns=['county','public_transit','ucgid'])
crosscheck.drop(0, inplace=True)
crosscheck.reset_index(drop=True, inplace=True)
crosscheck['public_transit'] = pd.to_numeric(crosscheck['public_transit'])
crosscheck
county | public_transit | ucgid | |
---|---|---|---|
0 | Jefferson County, Arkansas | 0.5 | 0500000US05069 |
1 | Columbia County, Florida | 0.0 | 0500000US12023 |
2 | Effingham County, Georgia | 0.3 | 0500000US13103 |
3 | Troup County, Georgia | 0.0 | 0500000US13285 |
4 | Walker County, Georgia | 0.0 | 0500000US13295 |
5 | McCracken County, Kentucky | 0.5 | 0500000US21145 |
6 | Marion County, Ohio | 0.8 | 0500000US39101 |
7 | Orange County, Texas | 0.0 | 0500000US48361 |
8 | Arecibo Municipio, Puerto Rico | 0.0 | 0500000US72013 |
9 | Toa Alta Municipio, Puerto Rico | 0.0 | 0500000US72135 |
# Impute annotation values with 2022 data
# (relies on the crosscheck rows coming back in the same ucgid order as cleaned_df)
cleaned_df.loc[cleaned_df['ucgid'].isin(crosscheck['ucgid']),'public_transit'] = crosscheck['public_transit'].values
cleaned_df[['county','public_transit']][cleaned_df['ucgid'].isin(crosscheck['ucgid'])]
county | public_transit | |
---|---|---|
37 | Jefferson County, Arkansas | 0.5 |
119 | Columbia County, Florida | 0.0 |
168 | Effingham County, Georgia | 0.3 |
187 | Troup County, Georgia | 0.0 |
188 | Walker County, Georgia | 0.0 |
277 | McCracken County, Kentucky | 0.5 |
562 | Marion County, Ohio | 0.8 |
733 | Orange County, Texas | 0.0 |
843 | Arecibo Municipio, Puerto Rico | 0.0 |
851 | Toa Alta Municipio, Puerto Rico | 0.0 |
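Assigning with .values depends on the 2022 response arriving in the same ucgid order as cleaned_df. An order-independent alternative, which I'd suggest as a safer variant, is to map on ucgid instead. A sketch with toy frames (names and values illustrative):

```python
import pandas as pd

cleaned = pd.DataFrame({'ucgid': ['G1', 'G2', 'G3'],
                        'public_transit': [0.5, -999999999.0, -999999999.0]})
crosscheck = pd.DataFrame({'ucgid': ['G3', 'G2'],   # note: different row order
                           'public_transit': [0.8, 0.3]})

# Build a ucgid -> 2022 value lookup, then fill only the flagged rows
lookup = crosscheck.set_index('ucgid')['public_transit']
mask = cleaned['public_transit'] == -999999999
cleaned.loc[mask, 'public_transit'] = cleaned.loc[mask, 'ucgid'].map(lookup)
print(cleaned)
```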
Other unexpected values:
To make sure the data are consistent, I will check whether any values are implausibly large.
cleaned_df.describe()
poverty_rate | health_insurance | median_household_income | bachelor_18-24 | bachelor_25+ | population_18-24 | population_25+ | unemployment_rate | public_transit | with_public_assistance | total_households | median_house_value | median_gross_rent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 854.000000 | 854.000000 | 854.000000 | 854.000000 | 8.540000e+02 | 854.000000 | 8.540000e+02 | 854.000000 | 854.000000 | 854.000000 | 8.540000e+02 | 8.540000e+02 | 854.000000 |
mean | 12.770843 | 92.588056 | 76512.345433 | 4446.871194 | 5.373878e+04 | 31078.072600 | 2.331190e+05 | 4.208431 | 1.528220 | 3132.231850 | 1.316149e+05 | 3.309796e+05 | 1258.251756 |
std | 5.401620 | 3.849054 | 20022.532998 | 8536.502992 | 9.874003e+04 | 51766.925148 | 4.031053e+05 | 1.727863 | 4.359279 | 7422.502216 | 2.172988e+05 | 1.743939e+05 | 372.467925 |
min | 2.000000 | 70.700000 | 16836.000000 | 14.000000 | 3.078000e+03 | 3531.000000 | 3.539600e+04 | 0.500000 | 0.000000 | 0.000000 | 1.900400e+04 | 5.270000e+04 | 447.000000 |
25% | 9.100000 | 90.400000 | 62032.250000 | 724.500000 | 1.154025e+04 | 8554.750000 | 6.646075e+04 | 3.100000 | 0.100000 | 649.000000 | 3.809475e+04 | 2.173750e+05 | 983.000000 |
50% | 12.000000 | 93.400000 | 72994.000000 | 1665.000000 | 2.244350e+04 | 15247.000000 | 1.132020e+05 | 4.000000 | 0.500000 | 1282.500000 | 6.376150e+04 | 2.909500e+05 | 1170.000000 |
75% | 15.675000 | 95.400000 | 87054.000000 | 4476.000000 | 5.332825e+04 | 34961.500000 | 2.361080e+05 | 5.000000 | 1.300000 | 2725.500000 | 1.370802e+05 | 3.917750e+05 | 1478.750000 |
max | 53.900000 | 98.500000 | 174148.000000 | 124490.000000 | 1.581628e+06 | 853064.000000 | 6.857511e+06 | 19.800000 | 52.500000 | 144731.000000 | 3.450470e+06 | 1.512200e+06 | 2797.000000 |
cleaned_df.max()
county Yuma County, Arizona poverty_rate 53.9 health_insurance 98.5 median_household_income 174148 bachelor_18-24 124490 bachelor_25+ 1581628 population_18-24 853064 population_25+ 6857511 unemployment_rate 19.8 public_transit 52.5 with_public_assistance 144731 total_households 3450470 median_house_value 1512200 median_gross_rent 2797 ucgid 0500000US72139 dtype: object
The data appear to fall within reasonable ranges.
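Beyond eyeballing describe(), the range checks can be made explicit with assertions. A sketch on a toy frame (the bounds are my own plausibility assumptions, not from the notebook):

```python
import pandas as pd

df = pd.DataFrame({'poverty_rate': [9.7, 53.9],
                   'public_transit': [0.0, 52.5]})

# Percentage columns must lie within [0, 100]
for col in ['poverty_rate', 'public_transit']:
    assert df[col].between(0, 100).all(), f'{col} out of range'
```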
5. Tidiness Issue: Unnecessary Variables¶
Issue: ucgid is not needed for further analysis.
Solution: I will remove ucgid.
cleaned_df.drop('ucgid', axis=1, inplace=True)
cleaned_df.head(2)
county | poverty_rate | health_insurance | median_household_income | bachelor_18-24 | bachelor_25+ | population_18-24 | population_25+ | unemployment_rate | public_transit | with_public_assistance | total_households | median_house_value | median_gross_rent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2213 | 39152 | 15552 | 184852 | 2.0 | 0.0 | 618 | 105698 | 307000 | 1286 |
1 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 605 | 10275 | 13499 | 78420 | 5.4 | 0.0 | 766 | 46310 | 160900 | 782 |
6. Tidiness Issue: Column headers are values, not variable names¶
Issue:
- population_18-24 and population_25+ are 2 values of one variable representing the total population aged 18+.
- bachelor_18-24 and bachelor_25+ are 2 values of one variable representing bachelor's degree holders aged 18+.
Solution: The columns that are values should be combined into distinct variables:
- population_18+ = population_18-24 + population_25+
- bachelor_18+ = bachelor_18-24 + bachelor_25+
This solution is preferred over unpivoting because the 2 new variables will be used directly in later analysis.
# Compute the total number of people aged 18+
population_18_over = cleaned_df['population_18-24'] + cleaned_df['population_25+']
# Compute the total number of people aged 18+ with a bachelor's degree or higher
bachelor_18_over = cleaned_df['bachelor_18-24'] + cleaned_df['bachelor_25+']
7. Derived Variable Creation¶
There are 2 variables that are needed for later data analysis but are not directly available in the raw dataset; they can be computed from existing variables:
- bachelor_holders: The percentage of people aged 18+ with at least a bachelor's degree
  bachelor_holders = (bachelor_18+ / population_18+) * 100
- public_assistance: The percentage of households receiving public assistance income
  public_assistance = (with_public_assistance / total_households) * 100
# Create a column for the percentage of bachelor's degree holders
cleaned_df['bachelor_holders'] = ((bachelor_18_over / population_18_over) * 100).round(2)
# Remove unnecessary columns
cleaned_df = cleaned_df.drop(columns=['population_18-24', 'population_25+', 'bachelor_18-24', 'bachelor_25+'])
cleaned_df.head(3)
county | poverty_rate | health_insurance | median_household_income | unemployment_rate | public_transit | with_public_assistance | total_households | median_house_value | median_gross_rent | bachelor_holders | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2.0 | 0.0 | 618 | 105698 | 307000 | 1286 | 20.64 |
1 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 5.4 | 0.0 | 766 | 46310 | 160900 | 782 | 11.84 |
2 | Cullman County, Alabama | 18.3 | 89.4 | 58923 | 3.8 | 0.3 | 483 | 35961 | 204000 | 788 | 10.75 |
# Create a column for the percentage of households receiving public assistance
cleaned_df['public_assistance'] = ((cleaned_df['with_public_assistance']/cleaned_df['total_households']) * 100).round(2)
# Remove unnecessary columns
cleaned_df = cleaned_df.drop(columns=['with_public_assistance','total_households'])
cleaned_df.head(3)
county | poverty_rate | health_insurance | median_household_income | unemployment_rate | public_transit | median_house_value | median_gross_rent | bachelor_holders | public_assistance | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2.0 | 0.0 | 307000 | 1286 | 20.64 | 0.58 |
1 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 5.4 | 0.0 | 160900 | 782 | 11.84 | 1.65 |
2 | Cullman County, Alabama | 18.3 | 89.4 | 58923 | 3.8 | 0.3 | 204000 | 788 | 10.75 | 1.34 |
8. Update Data Store¶
cleaned_df.to_csv('cleaned_data.csv', index=False)
# Check the saved CSV file
df = pd.read_csv('cleaned_data.csv')
df.head()
county | poverty_rate | health_insurance | median_household_income | unemployment_rate | public_transit | median_house_value | median_gross_rent | bachelor_holders | public_assistance | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Baldwin County, Alabama | 9.7 | 93.2 | 72915 | 2.0 | 0.0 | 307000 | 1286 | 20.64 | 0.58 |
1 | Calhoun County, Alabama | 21.3 | 91.1 | 50780 | 5.4 | 0.0 | 160900 | 782 | 11.84 | 1.65 |
2 | Cullman County, Alabama | 18.3 | 89.4 | 58923 | 3.8 | 0.3 | 204000 | 788 | 10.75 | 1.34 |
3 | DeKalb County, Alabama | 24.7 | 86.1 | 43509 | 3.8 | 0.0 | 156800 | 659 | 9.33 | 2.66 |
4 | Elmore County, Alabama | 12.9 | 92.4 | 72478 | 2.7 | 0.0 | 229900 | 933 | 17.73 | 0.17 |