Prediction: We are trying to predict how a Texas district will vote given its population data.
Thesis: District demographic features can explain what share of a district's vote goes Republican.
To obtain our dependent variable, we used requests.get and pandas.read_html on the 2018 Midterm Election results pages from the Texas Secretary of State's website.
We cleaned the raw HTML and exported a CSV for each race. We were then able to loop through the CSVs to extract exactly the data we needed: Republican vote count and total votes.
import requests
import pandas as pd

races_list = list(range(0, 38))  # 0 = US Senate, 1-36 = congressional districts, 37 = Governor

def tx_race_csv_creator(num_list):
    for race_num in num_list:
        if race_num > 0 and race_num < 37:  # congressional districts
            pull = requests.get('https://enrpages.sos.state.tx.us/public/nov06_331_race' + str(race_num) + '.htm')
            df_raw = pd.read_html(pull.text)[0]
            # header lists (defined elsewhere) vary with which third-party candidates ran
            if len(df_raw.columns) == 7:
                df_raw.columns = column_header_lib
            elif len(df_raw.columns) == 6:
                df_raw.columns = column_header_no_lib
            elif len(df_raw.columns) == 8:
                df_raw.columns = column_header_lib_ind
            else:
                return print("ERROR! Invalid Column Header")
            df_remove_early = df_raw[df_raw["County"] != "Early"]  # drop the early-voting summary rows
            df = df_remove_early.drop(df_remove_early.index[0:3])  # drop the first three non-data rows
            df.to_csv('/Users/jim_jack/Documents/mod_2_project/mod_2_project_ds102218/results_csvs/TX_' + str(race_num) + '.csv')
        elif race_num == 0:  # US Senate
            pull = requests.get('https://enrpages.sos.state.tx.us/public/nov06_331_race' + str(race_num) + '.htm')
            df_raw = pd.read_html(pull.text)[0]
            df_raw.columns = column_header
            df_remove_early = df_raw[df_raw["County"] != "Early"]
            df = df_remove_early.drop(df_remove_early.index[0:3])
            df.to_csv('/Users/jim_jack/Documents/mod_2_project/mod_2_project_ds102218/results_csvs/US_Sen.csv')
        else:  # race 37: Governor
            pull = requests.get('https://enrpages.sos.state.tx.us/public/nov06_331_race' + str(race_num) + '.htm')
            df_raw = pd.read_html(pull.text)[0]
            df_raw.columns = column_header
            df_remove_early = df_raw[df_raw["County"] != "Early"]
            df = df_remove_early.drop(df_remove_early.index[0:3])
            df.to_csv('/Users/jim_jack/Documents/mod_2_project/mod_2_project_ds102218/results_csvs/TX_Gov.csv')
        print("Created csv for Race Number: " + str(race_num))
        if race_num == num_list[-1]:
            print("All Results Data Updated")
We took the extracted data, put it into its own Pandas dataframe, and calculated the Republican vote share for the Senate race, the gubernatorial race, and each congressional race.
import glob

def create_summary_df():
    path = '/Users/jim_jack/Documents/mod_2_project/mod_2_project_ds102218/results_csvs'
    allFiles = glob.glob(path + "/*.csv")
    df_summary = []
    for data in allFiles:
        df_dict = {}
        local_df = pd.read_csv(data)
        df_dict['Race'] = file_name_finder(data)  # helper (defined elsewhere) that pulls the race name out of the file path
        if "REP" in local_df.columns:
            df_dict['REP'] = local_df.iloc[0]["REP"]  # the first row holds the race totals
        else:
            df_dict["REP"] = 0  # races with no Republican candidate
        df_dict['Total Votes'] = local_df.iloc[0]["Total Votes"]
        df_summary.append(df_dict)
    return pd.DataFrame(df_summary)
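The Republican vote share itself isn't computed inside the helper; a minimal sketch of that step (our assumption, matching the election_data.csv file and prop_r_vote column that appear later):

df_summary = create_summary_df()
df_summary['prop_r_vote'] = df_summary['REP'] / df_summary['Total Votes']
df_summary.to_csv('election_data.csv', index=False)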
For our study, we thought about which metrics might lead a district to vote Republican:
- Proportion of district that is of the baby-boomer generation and older
- The baby-boomer generation is defined (per Wikipedia) as people born between 1946 and 1964. Since the raw data gives population counts by age group, we counted everyone 55 and over, plus the 45-to-54 group multiplied by 0.2: in 2017 only the 53- and 54-year-olds in that group (two of its ten birth years) were boomers.
- We took this proportion out of people aged 18 and up, because we only wanted to look at people eligible to vote; the 0.4 multiplier on the 15-to-19 group keeps just the 18- and 19-year-olds.
- Proportion of the district that is White
- Proportion of the district that is male
- Proportion of the district that was born in Texas
- We took this proportion out of the number of people in the district that were born in the United States
- Mean income
- Education attainment
- We looked at the proportion of the district that achieved an education past high school -- including Associate's degrees, Bachelor's degrees, and higher
- Unemployment rate
- Buc-ee's: our categorical variable
We have a function for each new column (except mean income, which we pull directly from the cleaned table).
def babyboomers(row):
    # 0.4 of the 15-19 group are the 18- and 19-year-olds (eligible voters)
    teens = [int(row['15 to 19 years']) * 0.4]
    # 0.2 of the 45-54 group are the 53- and 54-year-olds, i.e. the boomers in that group in 2017
    youngbbbs = [int(row['45 to 54 years']) * 0.2]
    agegroups = ['20 to 24 years', '25 to 34 years', '35 to 44 years', '45 to 54 years', '55 to 59 years',
                 '60 to 64 years', '65 to 74 years', '75 to 84 years', '85 years and over']
    popn = sum(teens + list(map(lambda a: int(row[a]), agegroups)))  # voting-age population
    age_groups = ['55 to 59 years', '60 to 64 years', '65 to 74 years', '75 to 84 years', '85 years and over']
    return sum(youngbbbs + list(map(lambda a: int(row[a]), age_groups))) / popn

def prop_white(row):
    # 'Total population' appears more than once in the ACS table, so keep the first match
    return int(row['White']) / int(row['Total population'].head(1))

def prop_male(row):
    return int(row['Male']) / int(row['Total population'].head(1))

def texas_born(row):
    return int(row['State of residence']) / int(row['Born in United States'])

def higheredu(row):
    return (int(row["Associate's degree"]) + int(row["Bachelor's degree"]) +
            int(row["Graduate or professional degree"])) / int(row["Population 25 years and over"])

def unemployment(row):
    return int(row['Unemployed']) / int(row['Civilian labor force'].head(1))
import pandas as pd
import numpy as np
texas_raw = pd.read_csv('Texas_District_all.csv')
texas_raw.head()
labelled = texas_raw.set_index('Title') # setting the index to the metric measured
table_headers = list(map(lambda x: 'District 0'+str(x)+' Estimate', range(1, 10))) + \
list(map(lambda x: 'District '+str(x)+' Estimate', range(10, 37))) # new headers
cleaned = labelled.loc[:, table_headers].transpose().dropna(axis=1) # dropping the NaN columns
df = pd.DataFrame(index = cleaned.index)
# scraping for districts with Buc-ee's
from bs4 import BeautifulSoup
import requests
import re
url = 'https://www.buc-ees.com/locations.php'
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
locationsoup = soup.findAll({'div': 'container'})[13:40]  # the 27 store-location blocks on the page
loc = []
for location in locationsoup:
    loc += location.findAll('p')[-1].contents  # the last <p> in each block holds the address
addresses = list(filter(lambda l: 'TX' in l, loc))
# congressional district of each address, mapped by hand
districts = [22, 14, 10, 36, 14, 10, 26, 10, 27, 26, 14, 10, 27, 10, 14, 22, 27, 8, 35, 22, 27, 22, 31, 5, 14, 10, 27]
list(sorted(set(districts)))
[5, 8, 10, 14, 22, 26, 27, 31, 35, 36]
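The 0/1 indicator vector typed out by hand below can also be derived from this district list; a small sketch:

bucees_district_set = set(districts)
beaver_locations = [1 if d in bucees_district_set else 0 for d in range(1, 37)]  # districts 1-36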
# adding columns into our new dataframe!
df['older_proportion'] = cleaned.apply(lambda row: babyboomers(row),axis=1)
df['proportion_white'] = cleaned.apply(lambda row: prop_white(row),axis=1)
df['proportion_male'] = cleaned.apply(lambda row: prop_male(row),axis=1)
df['born_in_TX'] = cleaned.apply(lambda row: texas_born(row),axis=1)
df['mean_income'] = pd.to_numeric(cleaned['Mean household income (dollars)'])
df['higher_education'] = cleaned.apply(lambda row: higheredu(row),axis=1)
df['unemployment_rate'] = cleaned.apply(lambda row: unemployment(row),axis=1)
beaver_locations = [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1]
df["bucees"] = np.array(beaver_locations)
df.head()
| | older_proportion | proportion_white | proportion_male | born_in_TX | mean_income | higher_education | unemployment_rate | bucees |
|---|---|---|---|---|---|---|---|---|
District 01 Estimate | 0.421819 | 0.770341 | 0.488964 | 0.773983 | 66809 | 0.297772 | 0.060640 | 0 |
District 02 Estimate | 0.352724 | 0.676836 | 0.500970 | 0.651026 | 108618 | 0.490439 | 0.042236 | 0 |
District 03 Estimate | 0.338312 | 0.685146 | 0.490954 | 0.555650 | 122057 | 0.612976 | 0.036385 | 0 |
District 04 Estimate | 0.426503 | 0.812625 | 0.491080 | 0.725613 | 74581 | 0.315266 | 0.049219 | 0 |
District 05 Estimate | 0.373478 | 0.773341 | 0.502425 | 0.769360 | 69802 | 0.265619 | 0.047108 | 1 |
Based on the correlation table below, mean income, higher education, and born in Texas are all highly correlated with one another (pairwise correlations of 0.88 to 0.93 in absolute value). Therefore, we decide to drop the first two.
df.corr()
| | older_proportion | proportion_white | proportion_male | born_in_TX | mean_income | higher_education | unemployment_rate | bucees |
|---|---|---|---|---|---|---|---|---|
older_proportion | 1.000000 | 0.489234 | -0.052985 | -0.074147 | 0.107922 | 0.020705 | -0.267486 | 0.113633 |
proportion_white | 0.489234 | 1.000000 | 0.305883 | 0.183913 | -0.134012 | -0.177832 | -0.167455 | 0.064180 |
proportion_male | -0.052985 | 0.305883 | 1.000000 | 0.239094 | -0.151797 | -0.301555 | 0.054639 | 0.175571 |
born_in_TX | -0.074147 | 0.183913 | 0.239094 | 1.000000 | -0.880158 | -0.896623 | 0.688006 | -0.185320 |
mean_income | 0.107922 | -0.134012 | -0.151797 | -0.880158 | 1.000000 | 0.933985 | -0.588324 | 0.223564 |
higher_education | 0.020705 | -0.177832 | -0.301555 | -0.896623 | 0.933985 | 1.000000 | -0.649680 | 0.121578 |
unemployment_rate | -0.267486 | -0.167455 | 0.054639 | 0.688006 | -0.588324 | -0.649680 | 1.000000 | -0.020370 |
bucees | 0.113633 | 0.064180 | 0.175571 | -0.185320 | 0.223564 | 0.121578 | -0.020370 | 1.000000 |
final_independent = df.drop(['mean_income', 'higher_education'], axis=1)
final_independent.head()
| | older_proportion | proportion_white | proportion_male | born_in_TX | unemployment_rate | bucees |
|---|---|---|---|---|---|---|
District 01 Estimate | 0.421819 | 0.770341 | 0.488964 | 0.773983 | 0.060640 | 0 |
District 02 Estimate | 0.352724 | 0.676836 | 0.500970 | 0.651026 | 0.042236 | 0 |
District 03 Estimate | 0.338312 | 0.685146 | 0.490954 | 0.555650 | 0.036385 | 0 |
District 04 Estimate | 0.426503 | 0.812625 | 0.491080 | 0.725613 | 0.049219 | 0 |
District 05 Estimate | 0.373478 | 0.773341 | 0.502425 | 0.769360 | 0.047108 | 1 |
We output all our variables to a CSV file for easy access. In preparation for our regression, we also plot a scatter matrix and a heatmap to visualize how the independent variables interact with each other and with the dependent variable.
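The export itself is a single call; a minimal sketch, assuming the full_data.csv file name that the next cell reads back:

final_independent.to_csv('full_data.csv')  # the default index column comes back as 'Unnamed: 0'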
data = pd.read_csv('full_data.csv')
data.head()
| | Unnamed: 0 | older_proportion | proportion_white | proportion_male | born_in_TX | unemployment_rate | bucees |
|---|---|---|---|---|---|---|---|
0 | District 01 Estimate | 0.421819 | 0.770341 | 0.488964 | 0.773983 | 0.060640 | 0 |
1 | District 02 Estimate | 0.352724 | 0.676836 | 0.500970 | 0.651026 | 0.042236 | 0 |
2 | District 03 Estimate | 0.338312 | 0.685146 | 0.490954 | 0.555650 | 0.036385 | 0 |
3 | District 04 Estimate | 0.426503 | 0.812625 | 0.491080 | 0.725613 | 0.049219 | 0 |
4 | District 05 Estimate | 0.373478 | 0.773341 | 0.502425 | 0.769360 | 0.047108 | 1 |
import matplotlib.pyplot as plt
pd.plotting.scatter_matrix(data, figsize=[12, 12])
(output: a 7×7 grid of pairwise scatter plots for the variables)
import seaborn as sns
corr = data.corr()
mask = np.zeros_like(corr, dtype=bool)  # np.bool is removed in newer NumPy
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title('Heatmap of Correlation between Features')
data.corr()
| | older_proportion | proportion_white | proportion_male | born_in_TX | unemployment_rate | bucees |
|---|---|---|---|---|---|---|
older_proportion | 1.000000 | 0.489234 | -0.052985 | -0.074147 | -0.267486 | 0.113633 |
proportion_white | 0.489234 | 1.000000 | 0.305883 | 0.183913 | -0.167455 | 0.064180 |
proportion_male | -0.052985 | 0.305883 | 1.000000 | 0.239094 | 0.054639 | 0.175571 |
born_in_TX | -0.074147 | 0.183913 | 0.239094 | 1.000000 | 0.688006 | -0.185320 |
unemployment_rate | -0.267486 | -0.167455 | 0.054639 | 0.688006 | 1.000000 | -0.020370 |
bucees | 0.113633 | 0.064180 | 0.175571 | -0.185320 | -0.020370 | 1.000000 |
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn import pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn import datasets, linear_model
from sklearn import metrics
from sklearn import model_selection
We first add our dependent variable to the table of independent variables.
df_election_data = pd.read_csv('election_data.csv')
df_district_results = df_election_data.iloc[0:-2]  # drop the last two rows (the statewide Senate and Governor races), keeping the 36 districts
df_district_results.head()
| | Race | REP | Total Votes | prop_r_vote |
|---|---|---|---|---|
0 | TX_01 | 167734 | 231969 | 0.723088 |
1 | TX_02 | 139012 | 262924 | 0.528716 |
2 | TX_03 | 168775 | 311070 | 0.542563 |
3 | TX_04 | 188003 | 248373 | 0.756938 |
4 | TX_05 | 130404 | 209147 | 0.623504 |
data['prop_r_vote'] = df_district_results['prop_r_vote']
full_data = data.set_index('Unnamed: 0')
full_data.head()
| | older_proportion | proportion_white | proportion_male | born_in_TX | unemployment_rate | bucees | prop_r_vote |
|---|---|---|---|---|---|---|---|
District 01 Estimate | 0.421819 | 0.770341 | 0.488964 | 0.773983 | 0.060640 | 0 | 0.723088 |
District 02 Estimate | 0.352724 | 0.676836 | 0.500970 | 0.651026 | 0.042236 | 0 | 0.528716 |
District 03 Estimate | 0.338312 | 0.685146 | 0.490954 | 0.555650 | 0.036385 | 0 | 0.542563 |
District 04 Estimate | 0.426503 | 0.812625 | 0.491080 | 0.725613 | 0.049219 | 0 | 0.756938 |
District 05 Estimate | 0.373478 | 0.773341 | 0.502425 | 0.769360 | 0.047108 | 1 | 0.623504 |
Initial regression with original features.
outcome_1 = 'prop_r_vote'
predictors = full_data.drop([outcome_1], axis=1)
pred_sum = "+".join(predictors.columns)
formula = outcome_1 + "~" + pred_sum
model = ols(formula= formula, data=full_data).fit()
model.summary()
Dep. Variable: | prop_r_vote | R-squared: | 0.793 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.750 |
Method: | Least Squares | F-statistic: | 18.54 |
Date: | Fri, 07 Dec 2018 | Prob (F-statistic): | 9.89e-09 |
Time: | 09:16:36 | Log-Likelihood: | 30.319 |
No. Observations: | 36 | AIC: | -46.64 |
Df Residuals: | 29 | BIC: | -35.55 |
Df Model: | 6 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
Intercept | -6.8105 | 1.462 | -4.658 | 0.000 | -9.801 | -3.820 |
older_proportion | 4.1519 | 0.644 | 6.448 | 0.000 | 2.835 | 5.469 |
proportion_white | 0.0565 | 0.218 | 0.259 | 0.798 | -0.390 | 0.503 |
proportion_male | 13.1086 | 2.967 | 4.418 | 0.000 | 7.040 | 19.177 |
born_in_TX | -0.8282 | 0.369 | -2.243 | 0.033 | -1.583 | -0.073 |
unemployment_rate | -2.4426 | 2.860 | -0.854 | 0.400 | -8.291 | 3.406 |
bucees | 0.0215 | 0.047 | 0.458 | 0.650 | -0.074 | 0.117 |
Omnibus: | 1.127 | Durbin-Watson: | 2.374 |
---|---|---|---|
Prob(Omnibus): | 0.569 | Jarque-Bera (JB): | 1.017 |
Skew: | -0.219 | Prob(JB): | 0.601 |
Kurtosis: | 2.302 | Cond. No. | 280. |
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
- Initial model outcome: proportion_white and unemployment_rate both have insignificant p-values (0.798 and 0.400), so we decide to remove these variables and retest.
- Based on p-values, older_proportion, proportion_male, and born_in_TX are all significant variables.
- Our adjusted R2 is 0.750!
- Next, remove the insignificant variables and retest the R2.
outcome_2 = 'prop_r_vote'
predictors_2 = full_data.drop([outcome_2,"unemployment_rate",'proportion_white'], axis=1)
pred_sum_2 = "+".join(predictors_2.columns)
formula_2 = outcome_2 + "~" + pred_sum_2
model_2 = ols(formula= formula_2, data=full_data).fit()
model_2.summary()
Dep. Variable: | prop_r_vote | R-squared: | 0.786 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.758 |
Method: | Least Squares | F-statistic: | 28.48 |
Date: | Fri, 07 Dec 2018 | Prob (F-statistic): | 5.48e-10 |
Time: | 09:16:36 | Log-Likelihood: | 29.711 |
No. Observations: | 36 | AIC: | -49.42 |
Df Residuals: | 31 | BIC: | -41.50 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
Intercept | -7.2504 | 1.338 | -5.419 | 0.000 | -9.979 | -4.522 |
older_proportion | 4.4164 | 0.525 | 8.409 | 0.000 | 3.345 | 5.488 |
proportion_male | 13.9595 | 2.723 | 5.127 | 0.000 | 8.406 | 19.513 |
born_in_TX | -1.0431 | 0.237 | -4.409 | 0.000 | -1.526 | -0.561 |
bucees | 0.0111 | 0.045 | 0.248 | 0.806 | -0.080 | 0.102 |
Omnibus: | 0.832 | Durbin-Watson: | 2.446 |
---|---|---|---|
Prob(Omnibus): | 0.660 | Jarque-Bera (JB): | 0.771 |
Skew: | -0.044 | Prob(JB): | 0.680 |
Kurtosis: | 2.289 | Cond. No. | 225. |
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
- Our adjusted R2 is now 0.758, which justifies the removal of the two variables
- The Beaver is significant in our hearts
# Original features only, with the high p-value variables dropped (bucees kept for the Beaver)
X_train, X_test, y_train, y_test = train_test_split(predictors_2, full_data.prop_r_vote, test_size = 0.2)
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)
train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test
train_mse = mean_squared_error(y_train, y_hat_train)
test_mse = mean_squared_error(y_test, y_hat_test)
print('Train Mean Squared Error:', train_mse)
print('Test Mean Squared Error:', test_mse)
Train Mean Squared Error: 0.011204053454435784
Test Mean Squared Error: 0.01329003411687122
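Note that train_test_split shuffles at random, so these errors shift from run to run; pinning the seed (random_state=42 is an arbitrary choice on our part) makes the split reproducible:

X_train, X_test, y_train, y_test = train_test_split(predictors_2, full_data.prop_r_vote, test_size=0.2, random_state=42)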
X_test
| | older_proportion | proportion_male | born_in_TX | bucees |
|---|---|---|---|---|
District 09 Estimate | 0.316579 | 0.483160 | 0.769971 | 0 |
District 32 Estimate | 0.360520 | 0.494386 | 0.655495 | 0 |
District 18 Estimate | 0.302898 | 0.500825 | 0.794404 | 0 |
District 35 Estimate | 0.296198 | 0.504759 | 0.779531 | 1 |
District 33 Estimate | 0.288102 | 0.506449 | 0.834600 | 0 |
District 05 Estimate | 0.373478 | 0.502425 | 0.769360 | 1 |
District 07 Estimate | 0.320724 | 0.500733 | 0.662300 | 0 |
District 20 Estimate | 0.306944 | 0.492347 | 0.797475 | 0 |
Adding polynomial features and filter-based feature selection -- an attempt to reduce the MSE further.
features = full_data.iloc[:,:-1]
target = full_data.prop_r_vote
X_train_features, X_test_features, y_train_features, y_test_features = train_test_split(features, target, test_size = 0.2)
scaler = preprocessing.StandardScaler()
scaler.fit(features.iloc[:,:-1])  # fitted here but never applied below -- the features stay unscaled
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
features_plus_train = pd.DataFrame(poly.fit_transform(X_train_features), columns=poly.get_feature_names(X_train_features.columns))  # get_feature_names was renamed get_feature_names_out in newer scikit-learn
features_plus_train.head()
| | older_proportion | proportion_white | proportion_male | born_in_TX | unemployment_rate | bucees | older_proportion^2 | older_proportion proportion_white | older_proportion proportion_male | older_proportion born_in_TX | ... | proportion_male^2 | proportion_male born_in_TX | proportion_male unemployment_rate | proportion_male bucees | born_in_TX^2 | born_in_TX unemployment_rate | born_in_TX bucees | unemployment_rate^2 | unemployment_rate bucees | bucees^2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.360520 | 0.681317 | 0.494386 | 0.655495 | 0.046573 | 0.0 | 0.129975 | 0.245629 | 0.178236 | 0.236319 | ... | 0.244418 | 0.324068 | 0.023025 | 0.000000 | 0.429674 | 0.030529 | 0.000000 | 0.002169 | 0.000000 | 0.0 |
1 | 0.373478 | 0.773341 | 0.502425 | 0.769360 | 0.047108 | 1.0 | 0.139486 | 0.288826 | 0.187645 | 0.287339 | ... | 0.252431 | 0.386546 | 0.023668 | 0.502425 | 0.591914 | 0.036243 | 0.769360 | 0.002219 | 0.047108 | 1.0 |
2 | 0.296198 | 0.693808 | 0.504759 | 0.779531 | 0.051702 | 1.0 | 0.087734 | 0.205505 | 0.149509 | 0.230896 | ... | 0.254782 | 0.393475 | 0.026097 | 0.504759 | 0.607668 | 0.040303 | 0.779531 | 0.002673 | 0.051702 | 1.0 |
3 | 0.351962 | 0.650283 | 0.487003 | 0.701493 | 0.038909 | 0.0 | 0.123878 | 0.228875 | 0.171407 | 0.246899 | ... | 0.237172 | 0.341629 | 0.018949 | 0.000000 | 0.492093 | 0.027294 | 0.000000 | 0.001514 | 0.000000 | 0.0 |
4 | 0.374723 | 0.826691 | 0.508214 | 0.686633 | 0.051222 | 1.0 | 0.140418 | 0.309780 | 0.190440 | 0.257298 | ... | 0.258282 | 0.348957 | 0.026032 | 0.508214 | 0.471466 | 0.035171 | 0.686633 | 0.002624 | 0.051222 | 1.0 |
5 rows × 27 columns
features_plus_test = pd.DataFrame(poly.transform(X_test_features), columns=poly.get_feature_names(X_test_features.columns))  # transform only -- poly was already fit on the training features
features_plus_test.head()
| | older_proportion | proportion_white | proportion_male | born_in_TX | unemployment_rate | bucees | older_proportion^2 | older_proportion proportion_white | older_proportion proportion_male | older_proportion born_in_TX | ... | proportion_male^2 | proportion_male born_in_TX | proportion_male unemployment_rate | proportion_male bucees | born_in_TX^2 | born_in_TX unemployment_rate | born_in_TX bucees | unemployment_rate^2 | unemployment_rate bucees | bucees^2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.356327 | 0.834719 | 0.507412 | 0.795769 | 0.049204 | 0.0 | 0.126969 | 0.297433 | 0.180805 | 0.283554 | ... | 0.257467 | 0.403783 | 0.024966 | 0.000000 | 0.633248 | 0.039155 | 0.000000 | 0.002421 | 0.000000 | 0.0 |
1 | 0.323507 | 0.471969 | 0.482330 | 0.783724 | 0.059916 | 0.0 | 0.104657 | 0.152685 | 0.156037 | 0.253540 | ... | 0.232642 | 0.378014 | 0.028899 | 0.000000 | 0.614224 | 0.046957 | 0.000000 | 0.003590 | 0.000000 | 0.0 |
2 | 0.361713 | 0.782285 | 0.491302 | 0.685364 | 0.045634 | 0.0 | 0.130837 | 0.282963 | 0.177710 | 0.247905 | ... | 0.241378 | 0.336720 | 0.022420 | 0.000000 | 0.469723 | 0.031276 | 0.000000 | 0.002082 | 0.000000 | 0.0 |
3 | 0.323349 | 0.776726 | 0.492675 | 0.577323 | 0.039043 | 1.0 | 0.104555 | 0.251154 | 0.159306 | 0.186677 | ... | 0.242729 | 0.284432 | 0.019236 | 0.492675 | 0.333301 | 0.022541 | 0.577323 | 0.001524 | 0.039043 | 1.0 |
4 | 0.322342 | 0.833479 | 0.499603 | 0.852012 | 0.061489 | 0.0 | 0.103904 | 0.268665 | 0.161043 | 0.274639 | ... | 0.249603 | 0.425668 | 0.030720 | 0.000000 | 0.725924 | 0.052389 | 0.000000 | 0.003781 | 0.000000 | 0.0 |
5 rows × 27 columns
def variance_threshold_selector(data, threshold=0.01):
    # drop near-constant columns: keep only features whose variance exceeds the threshold
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
features_selected_train = variance_threshold_selector(features_plus_train)
features_selected_train.head()
| | bucees | older_proportion bucees | proportion_white^2 | proportion_white born_in_TX | proportion_white bucees | proportion_male bucees | born_in_TX^2 | born_in_TX bucees | bucees^2 |
|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.000000 | 0.464193 | 0.446600 | 0.000000 | 0.000000 | 0.429674 | 0.000000 | 0.0 |
1 | 1.0 | 0.373478 | 0.598056 | 0.594977 | 0.773341 | 0.502425 | 0.591914 | 0.769360 | 1.0 |
2 | 1.0 | 0.296198 | 0.481369 | 0.540845 | 0.693808 | 0.504759 | 0.607668 | 0.779531 | 1.0 |
3 | 0.0 | 0.000000 | 0.422868 | 0.456169 | 0.000000 | 0.000000 | 0.492093 | 0.000000 | 0.0 |
4 | 1.0 | 0.374723 | 0.683418 | 0.567634 | 0.826691 | 0.508214 | 0.471466 | 0.686633 | 1.0 |
#review correlations in a heatmap to see if any are closely correlated;
sns.set(style="ticks")
corr = features_selected_train.corr()
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
# Create correlation matrix
corr_matrix = features_selected_train.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
features_selected_train = features_selected_train.drop(columns=to_drop)  # reassign instead of inplace to avoid the SettingWithCopyWarning
to_drop
['older_proportion bucees',
'proportion_white bucees',
'proportion_male bucees',
'born_in_TX bucees',
'bucees^2']
- These are the feature variables to drop, as they contribute to excessive multicollinearity.
- We now move on to a mutual information regression test.
def information_selector(X, y, scoring, k=5):
    # keep the k features with the highest scores under the given scoring function
    selector = SelectKBest(score_func=scoring, k=k)
    selector.fit(X, y)
    return X[X.columns[selector.get_support(indices=True)]]
test = SelectKBest(score_func=mutual_info_regression, k='all')
fit = test.fit(features_selected_train, y_train_features)
features_selected_train[features_selected_train.columns[fit.get_support(indices=True)]].head()
| | bucees | proportion_white^2 | proportion_white born_in_TX | born_in_TX^2 |
|---|---|---|---|---|
0 | 0.0 | 0.464193 | 0.446600 | 0.429674 |
1 | 1.0 | 0.598056 | 0.594977 | 0.591914 |
2 | 1.0 | 0.481369 | 0.540845 | 0.607668 |
3 | 0.0 | 0.422868 | 0.456169 | 0.492093 |
4 | 1.0 | 0.683418 | 0.567634 | 0.471466 |
features_selected_train = information_selector(features_selected_train, y_train_features, mutual_info_regression, k='all')
lm = linear_model.LinearRegression()
model = lm.fit(features_selected_train, y_train_features)
features_selected_test = features_plus_test[features_selected_train.columns]
y_pred_features = lm.predict(features_selected_test)
print(metrics.mean_absolute_error(y_test_features, y_pred_features))
print(metrics.mean_squared_error(y_test_features, y_pred_features))
print(np.sqrt(metrics.mean_squared_error(y_test_features, y_pred_features)))
0.09312303233941979
0.015682117506903823
0.12522826161415732
This attempt returned a test MSE of 0.0157, roughly 70% higher than our Attempt 2 MSE of 0.00909, so we conclude that Attempt 2 was better!
linreg = LinearRegression()  # fresh estimator reused across the LOOCV folds

def leaveoneout(df, num):
    # return df with row `num` removed (and the old index column dropped)
    return df.reset_index().drop([num]).iloc[:, 1:]

def loocv_arrays(df, dep):
    # build the n leave-one-out train/test splits
    n = len(df)
    train_features = list(map(lambda i: leaveoneout(df, i).drop(dep, axis=1), range(n)))
    train_y = list(map(lambda i: leaveoneout(df, i)[dep], range(n)))
    test_features = list(map(lambda i: df.reset_index().drop(dep, axis=1).iloc[i, 1:], range(n)))
    test_y = list(map(lambda i: df.reset_index().iloc[i, 1:][dep], range(n)))
    return train_features, train_y, test_features, test_y

def trainmses(df, dep):
    train_x, train_y, test_x, test_y = loocv_arrays(df, dep)
    train_mses = []
    test_mses = []
    r2 = []
    adjr2 = []
    n = len(df)
    for i in range(n):  # one fit per leave-one-out fold
        linreg.fit(train_x[i], train_y[i])
        y_hat_train = linreg.predict(train_x[i])
        train_mses.append(mean_squared_error(train_y[i], y_hat_train))
        # predict the single held-out row by hand from the fitted coefficients
        coeffs = linreg.coef_
        intercept = linreg.intercept_
        y_hat_test = sum(list(map(lambda x: x[0] * x[1], zip(coeffs, list(test_x[i]))))) + intercept
        test_residuals = y_hat_test - test_y[i]
        test_mses.append(test_residuals ** 2)
        rsquared = metrics.r2_score(train_y[i], y_hat_train)
        r2.append(rsquared)
        adjr2.append(1 - ((1 - rsquared) * (n - 1)) / (n - 7))
    return ('training MSE: ' + str(pd.DataFrame(train_mses).mean()[0]),
            'test MSE: ' + str(pd.DataFrame(test_mses).mean()[0]),
            'r2: ' + str(sum(r2) / n),
            'adj: ' + str(sum(adjr2) / n))
trainmses(full_data, 'prop_r_vote')
('training MSE: 0.010801302256295605',
'test MSE: 0.016012846486860282',
'r2: 0.7719953377484944',
'adj: 0.7305690857884131')
trainmses(full_data.drop(["unemployment_rate",'proportion_white'], axis=1), 'prop_r_vote')
('training MSE: 0.011189077343740314',
'test MSE: 0.014737531979999562',
'r2: 0.7647770634482862',
'adj: 0.7218573754260927')
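For comparison, scikit-learn's LeaveOneOut (imported earlier but never used) reproduces the hand-rolled test-MSE loop in a few lines; a sketch under the same setup:

X = full_data.drop('prop_r_vote', axis=1)
y = full_data.prop_r_vote
# cross_val_score negates the MSE under this scoring name, so flip the sign back
neg_mses = cross_val_score(LinearRegression(), X, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error')
print('LOOCV test MSE:', -neg_mses.mean())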