#load libraries
library(tidyverse)
library(scales)
library(caret)
library(broom)
library(modelr)
library(rsample)
library(janitor)
library(vroom)
#set up environment
options(scipen = 999, digits = 5)
theme_set(theme_bw())
In this post I will be modeling house (land parcel) sales in Pittsburgh. The data is from the WPRDC’s Parcels n’at dashboard or here.
The goal is to use linear modeling to predict the sale price of a house using features of the house and the property.
This code sets up the environment and loads the libraries I will use.
This reads the data and engineers some features.
#read in data
<- vroom("post_data/allegheny_county_master_file.csv", col_types = cols(.default = "c")) %>%
df clean_names() %>%
mutate(across(.cols = c(saleprice, finishedlivingarea, lotarea, yearblt,
bedrooms, fullbaths, halfbaths), parse_number))
#glimpse(df)
# df %>%
# select(saleprice, finishedlivingarea, lotarea, yearblt, bedrooms, fullbaths, halfbaths) %>%
# glimpse()
#
# df %>%
# select(contains("muni"))
<- df %>%
df mutate(munidesc = str_replace(munidesc, " - PITTSBURGH", "")) %>%
mutate(finishedlivingarea_log10 = log10(finishedlivingarea),
lotarea_log10 = log10(lotarea),
saleprice_log10 = log10(saleprice)) %>%
select(parid, classdesc, munidesc, schooldesc, neighdesc, taxdesc,
usedesc, homesteadflag, farmsteadflag, styledesc,
yearblt, extfinish_desc, roofdesc, basementdesc,
gradedesc, conditiondesc, stories, totalrooms, bedrooms,
fullbaths, halfbaths, heatingcoolingdesc, fireplaces,
bsmtgarage, finishedlivingarea, finishedlivingarea_log10,
lotarea, lotarea_log10, saledate,
saleprice, saleprice_log10)
#create grade vectors
<- c("average -", "average", "average +",
grades_standard "good -", "good", "good +",
"very good -", "very good", "very good +")
<- c("poor -", "poor", "poor +",
grades_below_average_or_worse "below average -", "below average", "below average +")
<- c("excellent -", "excellent", "excellent +",
grades_excellent_or_better "highest cost -", "highest cost", "highest cost +")
#subset data and engineer features
<- df %>%
df filter(classdesc == "RESIDENTIAL",
> 100,
saleprice str_detect(munidesc, "Ward"),
> 0,
finishedlivingarea > 0) %>%
lotarea select(parid, munidesc, schooldesc, neighdesc, taxdesc,
usedesc, homesteadflag, farmsteadflag, styledesc,
yearblt, extfinish_desc, roofdesc, basementdesc,
heatingcoolingdesc, gradedesc, conditiondesc, stories,
totalrooms, bedrooms, fullbaths, halfbaths, fireplaces,
bsmtgarage, finishedlivingarea_log10, lotarea_log10, %>%
saleprice_log10, saledate) mutate(usedesc = fct_lump(usedesc, n = 5),
styledesc = fct_lump(styledesc, n = 10),
#clean up and condense gradedesc
gradedesc = str_to_lower(gradedesc),
gradedesc = case_when(gradedesc %in% grades_below_average_or_worse ~ "below average + or worse",
%in% grades_excellent_or_better ~ "excellent - or better",
gradedesc %in% grades_standard ~ gradedesc),
gradedesc gradedesc = fct_relevel(gradedesc, c("below average + or worse", "average -", "average", "average +",
"good -", "good", "good +",
"very good -", "very good", "very good +", "excellent - or better")))
#replace missing character rows with "missing", change character columns to factor
<- df %>%
df mutate_if(is.character, replace_na, "missing") %>%
mutate_if(is.character, as.factor)
#select response and features
<- df %>%
df select(munidesc, usedesc, styledesc, conditiondesc, gradedesc,
finishedlivingarea_log10, lotarea_log10, yearblt, bedrooms, %>%
fullbaths, halfbaths, saleprice_log10) na.omit()
#muni_desc_levels <- levels(df$munidesc)
#view data
glimpse(df)
Rows: 74,687
Columns: 12
$ munidesc <fct> 1st Ward , 1st Ward , 1st Ward , 1st Ward , 1…
$ usedesc <fct> Other, SINGLE FAMILY, Other, Other, Other, Ot…
$ styledesc <fct> Other, TOWNHOUSE, Other, Other, Other, Other,…
$ conditiondesc <fct> AVERAGE, EXCELLENT, GOOD, AVERAGE, AVERAGE, G…
$ gradedesc <fct> very good +, excellent - or better, very good…
$ finishedlivingarea_log10 <dbl> 3.0993, 3.6170, 3.1844, 3.2057, 3.1173, 3.159…
$ lotarea_log10 <dbl> 3.0993, 3.0461, 3.2355, 3.1584, 3.1173, 3.159…
$ yearblt <dbl> 2007, 2012, 2007, 2007, 2015, 1905, 1905, 190…
$ bedrooms <dbl> 2, 3, 2, 2, 2, 1, 2, 2, 1, 2, 4, 3, 3, 4, 4, …
$ fullbaths <dbl> 2, 3, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 3, 2, …
$ halfbaths <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, …
$ saleprice_log10 <dbl> 6.0414, 6.2544, 5.8543, 5.8876, 5.6767, 5.531…
As shown in the data above, the model uses the following features to predict sale price:
- municipality name
- primary use of the parcel
- style of building
- condition of the structure
- grade of construction
- living area in square feet
- lot area in square feet
- year the house was built
- number of bedrooms
- number of full baths
- number of half-baths
This code sets up the data for cross validation.
#create initial split object
<- initial_split(df, prop = .75)
df_split
#extract training dataframe
<- training(df_split)
training_data_full
#extract testing dataframe
<- testing(df_split)
testing_data
distinct(training_data_full, munidesc)|>
anti_join(distinct(testing_data, munidesc))
# A tibble: 1 × 1
munidesc
<fct>
1 1st Ward - McKEESPORT
#find dimensions of training_data_full and testing_data
dim(training_data_full)
[1] 56015 12
dim(testing_data)
[1] 18672 12
This code divides the data into training and testing sets.
set.seed(42)
#prep the df with the cross validation partitions
<- vfold_cv(training_data_full, v = 5)
cv_split
<- cv_split %>%
cv_data mutate(
#extract train dataframe for each split
train = map(splits, ~training(.x)),
#extract validate dataframe for each split
validate = map(splits, ~testing(.x))
)
#view df
cv_data
# 5-fold cross-validation
# A tibble: 5 × 4
splits id train validate
<list> <chr> <list> <list>
1 <split [44812/11203]> Fold1 <tibble [44,812 × 12]> <tibble [11,203 × 12]>
2 <split [44812/11203]> Fold2 <tibble [44,812 × 12]> <tibble [11,203 × 12]>
3 <split [44812/11203]> Fold3 <tibble [44,812 × 12]> <tibble [11,203 × 12]>
4 <split [44812/11203]> Fold4 <tibble [44,812 × 12]> <tibble [11,203 × 12]>
5 <split [44812/11203]> Fold5 <tibble [44,812 × 12]> <tibble [11,203 × 12]>
This builds the model to predict house sale price.
#build model using the train data for each fold of the cross validation
<- cv_data %>%
cv_models_lm mutate(model = map(train, ~lm(formula = saleprice_log10 ~ ., data = .x)))
cv_models_lm
# 5-fold cross-validation
# A tibble: 5 × 5
splits id train validate model
<list> <chr> <list> <list> <list>
1 <split [44812/11203]> Fold1 <tibble [44,812 × 12]> <tibble> <lm>
2 <split [44812/11203]> Fold2 <tibble [44,812 × 12]> <tibble> <lm>
3 <split [44812/11203]> Fold3 <tibble [44,812 × 12]> <tibble> <lm>
4 <split [44812/11203]> Fold4 <tibble [44,812 × 12]> <tibble> <lm>
5 <split [44812/11203]> Fold5 <tibble [44,812 × 12]> <tibble> <lm>
#problem with factors split across training/validation
#https://stats.stackexchange.com/questions/235764/new-factors-levels-not-present-in-training-data
This is where I begin to calculate metrics to judge how well my model is doing.
<- cv_models_lm %>%
cv_prep_lm mutate(
#extract actual sale price for the records in the validate dataframes
validate_actual = map(validate, ~.x$saleprice_log10),
#predict response variable for each validate set using its corresponding model
validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y))
)
#View data
cv_prep_lm
# 5-fold cross-validation
# A tibble: 5 × 7
splits id train validate model validate_actual
<list> <chr> <list> <list> <list> <list>
1 <split [44812/11203]> Fold1 <tibble> <tibble> <lm> <dbl [11,203]>
2 <split [44812/11203]> Fold2 <tibble> <tibble> <lm> <dbl [11,203]>
3 <split [44812/11203]> Fold3 <tibble> <tibble> <lm> <dbl [11,203]>
4 <split [44812/11203]> Fold4 <tibble> <tibble> <lm> <dbl [11,203]>
5 <split [44812/11203]> Fold5 <tibble> <tibble> <lm> <dbl [11,203]>
# ℹ 1 more variable: validate_predicted <list>
#calculate fit metrics for each validate fold
<- cv_prep_lm %>%
cv_eval_lm mutate(validate_rmse = map2_dbl(model, validate, modelr::rmse),
validate_mae = map2_dbl(model, validate, modelr::mae))
<- cv_eval_lm %>%
cv_eval_lm mutate(fit = map(model, ~glance(.x))) %>%
unnest(fit)
#view data
%>%
cv_eval_lm select(id, validate_mae, validate_rmse, adj.r.squared)
# A tibble: 5 × 4
id validate_mae validate_rmse adj.r.squared
<chr> <dbl> <dbl> <dbl>
1 Fold1 0.328 0.447 0.428
2 Fold2 0.330 0.450 0.428
3 Fold3 0.329 0.448 0.430
4 Fold4 0.328 0.448 0.431
5 Fold5 0.326 0.442 0.428
Finally, this calculates how well the model did on the validation set.
#summarize fit metrics on cross-validated dfs
%>%
cv_eval_lm select(validate_mae, validate_rmse, adj.r.squared) %>%
summarize_all(mean)
# A tibble: 1 × 3
validate_mae validate_rmse adj.r.squared
<dbl> <dbl> <dbl>
1 0.328 0.447 0.429
#fit model on full training set
<- cv_data %>%
train_df select(train) %>%
unnest(train)
<- lm(formula = saleprice_log10 ~ ., data = train_df)
model_train
%>%
model_train glance()
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.430 0.430 0.446 1941. 0 87 -137142. 274462. 275381.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
This is the RMSE on the training set
#calculate rmse on training set
rmse(model_train, train_df)
[1] 0.44626
This shows the impact each term of the model has on the response variable. This is for the training data.
#visualize estimates for terms
%>%
model_train tidy() %>%
filter(term != "(Intercept)") %>%
mutate(term = fct_reorder(term, estimate)) %>%
ggplot(aes(term, estimate)) +
geom_hline(yintercept = 0, linetype = 2, color = "red") +
geom_point() +
coord_flip()
Next, I apply the model to the testing data to see how the model does out-of-sample.
#create dfs for train_data_small and validate_data
#train_data_small <- cv_prep_lm %>%
# unnest(train) %>%
# select(-id)
<- cv_prep_lm %>%
validate_df select(validate) %>%
unnest()
This creates the augmented dataframe and plots the actual price vs. the fitted price.
#visualize model on validate data
<- augment(model_train, newdata = validate_df) %>%
augment_validate mutate(.resid = saleprice_log10 - .fitted)
#actual vs. fitted
%>%
cv_prep_lm unnest(validate_actual, validate_predicted) %>%
ggplot(aes(validate_actual, validate_predicted)) +
geom_abline() +
stat_density_2d(aes(fill = stat(level)), geom = "polygon") +
geom_smooth(method = "lm") +
scale_x_continuous(limits = c(2, 7)) +
scale_y_continuous(limits = c(2, 7)) +
coord_equal() +
scale_fill_viridis_c()
This distribution shows that the model overestimates the prices on many houses.
#distribution of residuals
%>%
augment_validate ggplot(aes(.resid)) +
geom_density() +
geom_vline(xintercept = 0, color = "red", linetype = 2)
This shows that the residuals are correlated with the actual price, which indicates that the model is failing to account for some dynamic in the sale process.
#sale price vs. residuals
%>%
augment_validate ggplot(aes(.resid, saleprice_log10)) +
stat_density_2d(aes(fill = stat(level)), geom = "polygon") +
geom_vline(xintercept = 0, color = "red", linetype = 2) +
scale_fill_viridis_c()
This calculates how well the model predicted sale price on out-of-sample testing data.
#calculate fit of model on test data
rmse(model_train, validate_df)
[1] 0.44626
mae(model_train, validate_df)
[1] 0.32743
rsquare(model_train, validate_df)
[1] 0.42988