Modeling Pittsburgh House Sales Linear

In this post I will be modeling house (land parcel) sales in Pittsburgh. The data is from the WPRDC’s Parcels n’at dashboard.

The goal is to use linear modeling to predict the sale price of a house using features of the house and the property.

This code sets up the environment and loads the libraries I will use.

#load libraries
library(tidyverse)
library(scales)
library(caret)
library(broom)
library(modelr)
library(rsample)
library(janitor)
library(vroom)

#set up environment
options(scipen = 999, digits = 5)

theme_set(theme_bw())

This reads the data and engineers some features.

#read in data
df <- vroom("data/assessments.csv", col_types = cols(.default = "c")) %>% 
  clean_names() %>% 
  mutate(across(.cols = c(saleprice, finishedlivingarea, lotarea, yearblt,
                          bedrooms, fullbaths, halfbaths), parse_number))

#glimpse(df)

# df %>% 
#   select(saleprice, finishedlivingarea, lotarea, yearblt, bedrooms, fullbaths, halfbaths) %>% 
#   glimpse()
# 
# df %>% 
#   select(contains("muni"))
df <- df %>% 
  mutate(munidesc = str_replace(munidesc, " - PITTSBURGH", "")) %>% 
  mutate(finishedlivingarea_log10 = log10(finishedlivingarea),
         lotarea_log10 = log10(lotarea),
         saleprice_log10 = log10(saleprice)) %>% 
  select(parid, classdesc, munidesc, schooldesc, neighdesc, taxdesc,
         usedesc, homesteadflag, farmsteadflag, styledesc,
         yearblt, extfinish_desc, roofdesc,  basementdesc,
         gradedesc, conditiondesc, stories, totalrooms, bedrooms,
         fullbaths, halfbaths, heatingcoolingdesc, fireplaces, 
         bsmtgarage, finishedlivingarea, finishedlivingarea_log10,
         lotarea, lotarea_log10, saledate,
         saleprice, saleprice_log10)

#create grade vectors
grades_standard <- c("average -", "average", "average +",
                     "good -", "good", "good +",
                     "very good -", "very good", "very good +")

grades_below_average_or_worse <- c("poor -", "poor", "poor +",
                                   "below average -", "below average", "below average +")

grades_excellent_or_better <- c("excellent -", "excellent", "excellent +",
                                "highest cost -", "highest cost", "highest cost +")

#subset data and engineer features
df <- df %>% 
  filter(classdesc == "RESIDENTIAL",
         saleprice > 100,
         str_detect(munidesc, "Ward"),
         finishedlivingarea > 0,
         lotarea > 0) %>% 
  select(parid, munidesc, schooldesc, neighdesc, taxdesc,
         usedesc, homesteadflag, farmsteadflag, styledesc,
         yearblt, extfinish_desc, roofdesc,  basementdesc, 
         heatingcoolingdesc, gradedesc, conditiondesc, stories, 
         totalrooms, bedrooms, fullbaths, halfbaths, fireplaces, 
         bsmtgarage, finishedlivingarea_log10, lotarea_log10, 
         saleprice_log10, saledate) %>% 
  mutate(usedesc = fct_lump(usedesc, n = 5),
         styledesc = fct_lump(styledesc, n = 10),
         #clean up and condense gradedesc
         gradedesc = str_to_lower(gradedesc),
         gradedesc = case_when(gradedesc %in% grades_below_average_or_worse ~ "below average + or worse",
                                    gradedesc %in% grades_excellent_or_better ~ "excellent - or better",
                                    gradedesc %in% grades_standard ~ gradedesc),
         gradedesc = fct_relevel(gradedesc, c("below average + or worse", "average -", "average", "average +",
                                                        "good -", "good", "good +",
                                                        "very good -", "very good", "very good +", "excellent - or better")))

#replace missing character rows with "missing", change character columns to factor
df <- df %>% 
  mutate_if(is.character, replace_na, "missing") %>% 
  mutate_if(is.character, as.factor)

#select response and features
df <- df %>% 
  select(munidesc, usedesc, styledesc, conditiondesc, gradedesc,
         finishedlivingarea_log10, lotarea_log10, yearblt, bedrooms, 
         fullbaths, halfbaths, saleprice_log10) %>% 
  na.omit()

#view data
glimpse(df)
## Rows: 73,679
## Columns: 12
## $ munidesc                 <fct> 1st Ward , 1st Ward , 1st Ward , 1st Ward , …
## $ usedesc                  <fct> SINGLE FAMILY, SINGLE FAMILY, SINGLE FAMILY,…
## $ styledesc                <fct> TOWNHOUSE, TOWNHOUSE, TOWNHOUSE, Other, Othe…
## $ conditiondesc            <fct> EXCELLENT, EXCELLENT, EXCELLENT, AVERAGE, AV…
## $ gradedesc                <fct> excellent - or better, excellent - or better…
## $ finishedlivingarea_log10 <dbl> 3.3034, 3.6170, 3.7042, 3.1173, 3.0993, 3.12…
## $ lotarea_log10            <dbl> 2.9978, 3.0461, 3.1477, 3.1173, 3.0993, 3.14…
## $ yearblt                  <dbl> 2015, 2012, 1920, 2015, 2007, 2007, 2015, 20…
## $ bedrooms                 <dbl> 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 4, 6,…
## $ fullbaths                <dbl> 3, 3, 3, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2,…
## $ halfbaths                <dbl> 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,…
## $ saleprice_log10          <dbl> 6.3010, 6.2544, 6.2380, 5.4695, 5.4033, 5.78…

As shown in the data above, the model uses the following features to predict sale price:

  • municipality name
  • primary use of the parcel
  • style of building
  • condition of the structure
  • grade of construction
  • living area in square feet
  • lot area in square feet
  • year the house was built
  • number of bedrooms
  • number of full baths
  • number of half-baths

This code sets up the data for cross validation.

#create initial split object
df_split <- initial_split(df, prop = .75)

#extract training dataframe
training_data_full <- training(df_split)

#extract testing dataframe
testing_data <- testing(df_split)

#find dimensions of training_data_full and testing_data
dim(training_data_full)
## [1] 55260    12
dim(testing_data)
## [1] 18419    12

This code divides the data into training and testing sets.

set.seed(42)

#prep the df with the cross validation partitions
cv_split <- vfold_cv(training_data_full, v = 5)

cv_data <- cv_split %>% 
  mutate(
    #extract train dataframe for each split
    train = map(splits, ~training(.x)), 
    #extract validate dataframe for each split
    validate = map(splits, ~testing(.x))
  )

#view df
cv_data
## #  5-fold cross-validation 
## # A tibble: 5 x 4
##   splits                id    train                  validate              
##   <list>                <chr> <list>                 <list>                
## 1 <split [44.2K/11.1K]> Fold1 <tibble [44,208 × 12]> <tibble [11,052 × 12]>
## 2 <split [44.2K/11.1K]> Fold2 <tibble [44,208 × 12]> <tibble [11,052 × 12]>
## 3 <split [44.2K/11.1K]> Fold3 <tibble [44,208 × 12]> <tibble [11,052 × 12]>
## 4 <split [44.2K/11.1K]> Fold4 <tibble [44,208 × 12]> <tibble [11,052 × 12]>
## 5 <split [44.2K/11.1K]> Fold5 <tibble [44,208 × 12]> <tibble [11,052 × 12]>

This builds the model to predict house sale price.

#build model using the train data for each fold of the cross validation
cv_models_lm <- cv_data %>% 
  mutate(model = map(train, ~lm(formula = saleprice_log10 ~ ., data = .x)))

cv_models_lm
## #  5-fold cross-validation 
## # A tibble: 5 x 5
##   splits                id    train                  validate              model
##   <list>                <chr> <list>                 <list>                <lis>
## 1 <split [44.2K/11.1K]> Fold1 <tibble [44,208 × 12]> <tibble [11,052 × 12… <lm> 
## 2 <split [44.2K/11.1K]> Fold2 <tibble [44,208 × 12]> <tibble [11,052 × 12… <lm> 
## 3 <split [44.2K/11.1K]> Fold3 <tibble [44,208 × 12]> <tibble [11,052 × 12… <lm> 
## 4 <split [44.2K/11.1K]> Fold4 <tibble [44,208 × 12]> <tibble [11,052 × 12… <lm> 
## 5 <split [44.2K/11.1K]> Fold5 <tibble [44,208 × 12]> <tibble [11,052 × 12… <lm>
#problem with factors split across training/validation
#https://stats.stackexchange.com/questions/235764/new-factors-levels-not-present-in-training-data

This is where I begin to calculate metrics to judge how well my model is doing.

cv_prep_lm <- cv_models_lm %>% 
  mutate(
    #extract actual sale price for the records in the validate dataframes
    validate_actual = map(validate, ~.x$saleprice_log10),
    #predict response variable for each validate set using its corresponding model
    validate_predicted = map2(.x = model, .y = validate, ~predict(.x, .y))
  )

#View data
cv_prep_lm
## #  5-fold cross-validation 
## # A tibble: 5 x 7
##   splits     id    train      validate    model validate_actual validate_predic…
##   <list>     <chr> <list>     <list>      <lis> <list>          <list>          
## 1 <split [4… Fold1 <tibble [… <tibble [1… <lm>  <dbl [11,052]>  <dbl [11,052]>  
## 2 <split [4… Fold2 <tibble [… <tibble [1… <lm>  <dbl [11,052]>  <dbl [11,052]>  
## 3 <split [4… Fold3 <tibble [… <tibble [1… <lm>  <dbl [11,052]>  <dbl [11,052]>  
## 4 <split [4… Fold4 <tibble [… <tibble [1… <lm>  <dbl [11,052]>  <dbl [11,052]>  
## 5 <split [4… Fold5 <tibble [… <tibble [1… <lm>  <dbl [11,052]>  <dbl [11,052]>
#calculate fit metrics for each validate fold       
cv_eval_lm <- cv_prep_lm %>% 
  mutate(validate_rmse = map2_dbl(model, validate, modelr::rmse),
         validate_mae = map2_dbl(model, validate, modelr::mae))

cv_eval_lm <- cv_eval_lm %>% 
  mutate(fit = map(model, ~glance(.x))) %>% 
  unnest(fit)
#view data
cv_eval_lm %>% 
  select(id, validate_mae, validate_rmse, adj.r.squared)
## # A tibble: 5 x 4
##   id    validate_mae validate_rmse adj.r.squared
##   <chr>        <dbl>         <dbl>         <dbl>
## 1 Fold1        0.304         0.417         0.474
## 2 Fold2        0.305         0.417         0.474
## 3 Fold3        0.297         0.411         0.471
## 4 Fold4        0.304         0.418         0.474
## 5 Fold5        0.301         0.419         0.477

Finally, this calculates how well the model did on the validation set.

#summarize fit metrics on cross-validated dfs
cv_eval_lm %>% 
  select(validate_mae, validate_rmse, adj.r.squared) %>% 
  summarize_all(mean)
## # A tibble: 1 x 3
##   validate_mae validate_rmse adj.r.squared
##          <dbl>         <dbl>         <dbl>
## 1        0.302         0.417         0.474
#fit model on full training set
train_df <- cv_data %>% 
  select(train) %>% 
  unnest(train)

model_train <- lm(formula = saleprice_log10 ~ ., data = train_df)

model_train %>% 
  glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df  logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>   <dbl>  <dbl>  <dbl>
## 1     0.474         0.474 0.416     2293.       0    87 -1.20e5 2.39e5 2.40e5
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

This is the RMSE on the training set

#calculate rmse on training set
rmse(model_train, train_df)
## [1] 0.41561

This shows the impact each term of the model has on the response variable. This is for the training data.

#visualize estimates for terms
model_train %>% 
  tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(term, estimate)) +
  geom_hline(yintercept = 0, linetype = 2, color = "red") +
  geom_point() +
  coord_flip()

Next, I apply the model to the testing data to see how the model does out-of-sample.

#create dfs for train_data_small and validate_data
#train_data_small <- cv_prep_lm %>% 
#  unnest(train) %>% 
#  select(-id)

validate_df <- cv_prep_lm %>% 
  select(validate) %>% 
  unnest()

This creates the augmented dataframe and plots the actual price vs. the fitted price.

#visualize model on validate data
augment_validate <- augment(model_train, newdata = validate_df) %>% 
  mutate(.resid = saleprice_log10 - .fitted)

#actual vs. fitted
cv_prep_lm %>% 
  unnest(validate_actual, validate_predicted) %>% 
  ggplot(aes(validate_actual, validate_predicted)) +
  geom_abline() +
  stat_density_2d(aes(fill = stat(level)), geom = "polygon") +
  geom_smooth(method = "lm") +
  scale_x_continuous(limits = c(2, 7)) +
  scale_y_continuous(limits = c(2, 7)) +
  coord_equal() +
  scale_fill_viridis_c()

This distribution shows that the model overestimates the prices on many houses.

#distribution of residuals
augment_validate %>% 
  ggplot(aes(.resid)) +
  geom_density() +
  geom_vline(xintercept = 0, color = "red", linetype = 2)

This shows that the residuals are correlated with the actual price, which indicates that the model is failing to account for some dynamic in the sale process.

#sale price vs. residuals
augment_validate %>% 
  ggplot(aes(.resid, saleprice_log10)) +
  stat_density_2d(aes(fill = stat(level)), geom = "polygon") +
  geom_vline(xintercept = 0, color = "red", linetype = 2) +
  scale_fill_viridis_c()

This calculates how well the model predicted sale price on out-of-sample testing data.

#calculate fit of model on test data
rmse(model_train, validate_df)
## [1] 0.41561
mae(model_train, validate_df)
## [1] 0.30157
rsquare(model_train, validate_df)
## [1] 0.47443

Related