Land bank follow-up

The land bank is a great program, but a very limited one. How do officials decide which properties to remediate?

This is similar to the analysis we did for this story on land banks.

After it ran, we got several tips about other counties where officials may have used this power to benefit themselves or others.

We are looking first at Lawrence County, the subject of our most egregious tip.

First, the setup. We show the package-loading output here, even though suppressing it is the default, because we want others to be able to reproduce this work.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.5
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(DT)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(XML)
library(purrr)
library(leaflet)
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(readxl)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.0
## ✔ dials        1.0.0     ✔ tune         1.0.0
## ✔ infer        1.0.3     ✔ workflows    1.1.0
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.2     ✔ yardstick    1.1.0
## ✔ recipes      1.0.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(themis)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(mlr3) 
library(mlr3spatiotempcv)
library(mlr3spatial)
## 
## Attaching package: 'mlr3spatial'
## 
## The following objects are masked from 'package:mlr3spatiotempcv':
## 
##     as_task_classif_st, as_task_classif_st.data.frame,
##     as_task_classif_st.DataBackend, as_task_classif_st.sf,
##     as_task_classif_st.TaskClassifST, as_task_regr_st,
##     as_task_regr_st.data.frame, as_task_regr_st.DataBackend,
##     as_task_regr_st.sf, as_task_regr_st.TaskClassifST,
##     as_task_regr_st.TaskRegrST, TaskClassifST, TaskRegrST

## Loading in files

setwd("~/Code/Housing_Equity_3")
Lawrence_LB_for_sale <- read_excel("LawrenceCounty_PFS-FullPage.xlsx") %>% 
  mutate(type="property_for_sale")

Lawrence_LB_sold <- rio::import("LawrenceCounty_PSold-FullPage.xlsx") %>% 
  mutate(type="sold_property")



Lawrence_LB_All <- rbind( 
  (Lawrence_LB_sold %>% dplyr::select(Address,Township,SalePrice,PropertyID,type)),  
  (Lawrence_LB_for_sale %>% dplyr::select(Address,Township,SalePrice,PropertyID,type))) 

Lawrence_LB_All <- Lawrence_LB_All %>% 
  distinct(PropertyID, .keep_all = TRUE) #This fixes 35-053-0900.000 which was in there twice. 

Lawrence_Land_data <- rio::import("lawrenceoh/DETINFO.txt") #from a FOIA


Lawrence_Delinquents_All <- rio::import("Lawrence Current delinquent parcels.xlsx") #From the county website


Lawrence_LB_sold$SaleDate <-  mdy(Lawrence_LB_sold$SaleDate)

ggplot(Lawrence_LB_sold, aes(x=SalePrice))+
  geom_histogram()+
  theme(axis.text.x = element_text(angle = 45, vjust = 1.2, hjust = 1.1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

This histogram shows the sale prices of land bank properties.
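For the record, here is the same data as a plain numeric summary (a quick check, nothing fancy):

summary(Lawrence_LB_sold$SalePrice) #numeric summary of the sale prices shown in the histogram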

## Loading map data

Available through the Lawrence County FTP site: https://downloads.accuglobe.schneidergis.com/lawrenceoh/

setwd("~/Code/Housing_Equity_3/Lawrence Parcels/")
Lawrence_Geo <- sf::st_read("Parcels.shp")
## Reading layer `Parcels' from data source 
##   `/Users/Lucia/Code/Housing_Equity_3/Lawrence Parcels/Parcels.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 56373 features and 5 fields
## Geometry type: POLYGON
## Dimension:     XYZ
## Bounding box:  xmin: 1878194 ymin: 146773.4 xmax: 2029372 ymax: 309149.8
## z_range:       zmin: 0 zmax: 0
## CRS:           NA
#This comes in with no CRS, so we have to add it ourselves.
st_crs(Lawrence_Geo) <-"+proj=lcc +lat_1=38.73333333333333 +lat_2=40.03333333333333 +lat_0=38 +lon_0=-82.5 +x_0=600000.0000000001 +y_0=0 +datum=NAD83 +units=us-ft +no_defs"

head(Lawrence_Geo)
## Simple feature collection with 6 features and 5 fields
## Geometry type: POLYGON
## Dimension:     XYZ
## Bounding box:  xmin: 1914536 ymin: 195922.6 xmax: 2022593 ymax: 266134.6
## z_range:       zmin: 0 zmax: 0
## CRS:           +proj=lcc +lat_1=38.73333333333333 +lat_2=40.03333333333333 +lat_0=38 +lon_0=-82.5 +x_0=600000.0000000001 +y_0=0 +datum=NAD83 +units=us-ft +no_defs
##            COMPNUM          PARCEL                            OWNER
## 1     320320100000 32-032-0100.000 CRUM JAMES C JR & TERESA ANNETTE
## 2 Unknown Parcel #            <NA>                             <NA>
## 3     320340500000 32-034-0500.000      HAMILTON DALLAS R & TERRI L
## 4 Unknown Parcel #            <NA>                             <NA>
## 5     182050800001 18-205-0800.001      THE HECLA WATER ASSOCIATION
## 6     182050700000 18-205-0700.000                   WHITE BILLY JR
##          ADDRESS ACRES                       geometry
## 1    0 ST RT 217  3.71 POLYGON Z ((1985082 201199....
## 2           <NA>    NA POLYGON Z ((1914607 195922....
## 3 4833 ST TR 378  0.80 POLYGON Z ((1979113 201557....
## 4           <NA>    NA POLYGON Z ((1925338 265438....
## 5     0 CO RD 73  0.50 POLYGON Z ((2021924 200852....
## 6  1389 CO RD 73 16.75 POLYGON Z ((2022000 201672....
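Side note: that proj4 string is the Ohio South state plane. Assuming EPSG:3735 (NAD83 / Ohio South, US survey feet) is the matching code, a shorter equivalent would be:

st_crs(Lawrence_Geo) <- 3735 #hypothetical equivalent of the proj4 string above, assuming EPSG:3735 matches this projection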

Where are our land bank properties?

Lawrence_LB_Geo <- Lawrence_Geo %>% 
  filter(PARCEL %in%  Lawrence_LB_All$PropertyID)

Lawrence_LB_Geo <- st_as_sf(Lawrence_LB_Geo)

plot(Lawrence_LB_Geo %>% dplyr::select(PARCEL, geometry))

Now we want to see where the properties are with a background map.

setwd("~/Code/Housing_Equity_3/")
Lawrence_LB_Geo <- st_transform(Lawrence_LB_Geo, crs= 4326)

Lawrence_LB_Geo <- st_zm(Lawrence_LB_Geo, drop = T, what="ZM")

Lawrence_Properties_plotted <- leaflet(data = Lawrence_LB_Geo) %>% 
  addTiles() %>% 
  setView(-82.53328542385822,38.57712964987033, zoom = 10) %>% #Set to Lawrence County center
  addPolygons(data = Lawrence_LB_Geo$geometry,  options = tileOptions(minZoom = 0, maxZoom = 14, continuousWorld = T), popup = paste("Parcel Number: ", Lawrence_LB_Geo$PARCEL)) #one polygon layer with popups is enough
  
Lawrence_Properties_plotted

And where are the delinquent properties?

Lawrence_Delinquent_Geo <- Lawrence_Geo %>% 
  filter(PARCEL %in%  Lawrence_Delinquents_All$`Parcel Number`)

Lawrence_Delinquent_Geo <- st_transform(Lawrence_Delinquent_Geo, crs= 4326)

Lawrence_Delinquent_Geo <- st_as_sf(Lawrence_Delinquent_Geo)

plot(Lawrence_Delinquent_Geo %>% dplyr::select(PARCEL, geometry))

Adding a background map:

Lawrence_Delinquent_Geo <- st_zm(Lawrence_Delinquent_Geo, drop = T, what="ZM")

Lawrence_Properties_plotted_with_delinquents <- leaflet(data = Lawrence_Delinquent_Geo) %>% 
  #addPolygons(data = Lawrence_LB_Geo$geometry, color ="blue") %>% 
  addTiles() %>% 
  setView(-82.53328542385822,38.57712964987033, zoom = 10) %>% #Set to Lawrence County Center
  addPolygons(data = Lawrence_LB_Geo$geometry, color = "blue", options = tileOptions(minZoom = 0, maxZoom = 14, continuousWorld = T), popup = paste("Parcel Number: ", Lawrence_LB_Geo$PARCEL)) %>% 
  addPolygons(data = Lawrence_Delinquent_Geo$geometry, color = "purple",   options = tileOptions(minZoom = 0, maxZoom = 14, continuousWorld = T), popup = paste("Parcel Number: ", Lawrence_Delinquent_Geo$PARCEL))
  
Lawrence_Properties_plotted_with_delinquents

### Combining Data

setwd("~/Code/Housing_Equity_3")


Lawrence_LB_with_Tax <- rio::import("~/Code/Housing_Equity_3/Lawrence Land Bank Property Values - Sheet1.csv")

Lawrence_data <- Lawrence_Geo %>% 
  filter(PARCEL %in% Lawrence_LB_with_Tax$PropertyID | PARCEL %in% Lawrence_Delinquents_All$`Parcel Number`) %>% 
  dplyr::select(-COMPNUM) %>% 
  #select(mpropertyNumber, TotValue, priorDelqOwedTot, propertyLand, ImprLand, CertDelqYear) %>%
  mutate(Land_Bank=if_else(PARCEL %in% Lawrence_LB_with_Tax$PropertyID,1,0)) %>% 
  mutate(amount_owed=if_else(PARCEL %in% Lawrence_LB_with_Tax$PropertyID,Lawrence_LB_with_Tax$Amount[match(PARCEL, Lawrence_LB_with_Tax$PropertyID)],as.character(Lawrence_Delinquents_All$Amount[match(PARCEL, Lawrence_Delinquents_All$`Parcel Number`)]))) %>%
  mutate(Certified_Delinquent_Year=if_else(PARCEL %in% Lawrence_LB_with_Tax$PropertyID,Lawrence_LB_with_Tax$Year_Certified_Delinquent[match(PARCEL, Lawrence_LB_with_Tax$PropertyID)], as.integer(Lawrence_Delinquents_All$`Certified Year`[match(PARCEL, Lawrence_Delinquents_All$`Parcel Number`)]))) %>% 
  mutate(years_delinquent=if_else(PARCEL %in% Lawrence_LB_with_Tax$PropertyID,Lawrence_LB_with_Tax$Years_On_Delinquent_Tax_Roll_Before_LB_Transfer[match(PARCEL, Lawrence_LB_with_Tax$PropertyID)],2022-Certified_Delinquent_Year)) 


Lawrence_data$amount_owed <- as.double(Lawrence_data$amount_owed)
## Warning: NAs introduced by coercion

There is a problem, though: some properties come up twice. We called the Auditor’s office (they are the ones who put this data on the web) and they said this was actually correct; they called it a “backsplit” and said parcels sometimes get split out, etc.

So those duplicate shapes have to be merged.
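As a minimal sketch of what that merge does (toy polygons, not our data), two rows sharing a parcel ID collapse into one row whose geometry is the union of both shapes:

#Toy example: two adjacent unit squares that share a PARCEL number
p1 <- st_sfc(st_polygon(list(rbind(c(0,0), c(1,0), c(1,1), c(0,1), c(0,0)))))
p2 <- st_sfc(st_polygon(list(rbind(c(1,0), c(2,0), c(2,1), c(1,1), c(1,0)))))
toy <- st_sf(PARCEL = c("A", "A"), geometry = c(p1, p2))
toy %>% group_by(PARCEL) %>% summarize(geometry = st_union(geometry)) #one row for "A", covering both squares

And now the real thing: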

duplicates <- Lawrence_data %>% 
  group_by(PARCEL) %>% summarize(geometry=st_union(geometry)) #one row per parcel, with any split shapes unioned together (note: this covers every parcel, not just the duplicated ones)

not_dup <- Lawrence_data %>% 
  filter(!(PARCEL %in% duplicates$PARCEL)) #empty by construction, kept as a guard

dups <- Lawrence_data %>% 
  filter(PARCEL %in% duplicates$PARCEL) %>% 
  distinct(PARCEL, .keep_all = TRUE) %>% #keep one row of attributes per parcel...
  select(-geometry) %>% 
  mutate(geometry=duplicates$geometry[match(PARCEL,duplicates$PARCEL)]) #...and attach the unioned shape

Lawrence_data <- rbind(not_dup, dups)

### Basic Data Exploration

summary(Lawrence_data)
##     PARCEL             OWNER             ADDRESS              ACRES        
##  Length:5202        Length:5202        Length:5202        Min.   :  0.000  
##  Class :character   Class :character   Class :character   1st Qu.:  0.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :  0.220  
##                                                           Mean   :  2.469  
##                                                           3rd Qu.:  1.060  
##                                                           Max.   :277.730  
##                                                                            
##    Land_Bank        amount_owed        Certified_Delinquent_Year
##  Min.   :0.00000   Min.   :     0.00   Min.   :1981             
##  1st Qu.:0.00000   1st Qu.:    18.93   1st Qu.:2015             
##  Median :0.00000   Median :   187.56   Median :2019             
##  Mean   :0.07382   Mean   :  1016.49   Mean   :2016             
##  3rd Qu.:0.00000   3rd Qu.:   949.10   3rd Qu.:2020             
##  Max.   :1.00000   Max.   :138631.13   Max.   :2021             
##                    NA's   :12          NA's   :19               
##  years_delinquent           geometry   
##  Min.   : 0.000   MULTIPOLYGON Z:  67  
##  1st Qu.: 2.000   POLYGON Z     :5135  
##  Median : 2.000   epsg:NA       :   0  
##  Mean   : 5.456   +proj=lcc ... :   0  
##  3rd Qu.: 6.000                        
##  Max.   :41.000                        
##  NA's   :19

Who now owns the most land bank properties?

datatable(Lawrence_data %>% filter(Land_Bank==1) %>%  group_by(OWNER) %>% summarize(Total=n()))
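Sorting that table (a small tweak to the above) puts the biggest owners on top:

datatable(Lawrence_data %>% st_drop_geometry() %>% filter(Land_Bank==1) %>% group_by(OWNER) %>% summarize(Total=n()) %>% arrange(desc(Total))) #dropping geometry first also keeps datatable() from printing the shapes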

The bulk haven’t been sold, but some people own quite a few. How many properties were just given to the land bank?

Lawrence_data %>% filter(Land_Bank==1) %>% filter(Certified_Delinquent_Year==0 | is.na(Certified_Delinquent_Year)) %>% nrow()
## [1] 19

Some properties were not delinquent: the amount owed is 0. We are going to exclude them because they were probably just given to the land bank. (This is legal if the owner agrees.)

Lawrence_data %>% filter(amount_owed<=0) %>% nrow()
## [1] 2

Also, there is no certified year for 5 properties; these are all land bank properties that owe little money. What does the distribution of the amount owed look like?

(Lawrence_data %>% filter(amount_owed<=0) %>% nrow()) / (Lawrence_data %>% nrow())
## [1] 0.0003844675
Lawrence_data <- Lawrence_data %>% filter(amount_owed>0)

Lawrence_data <- Lawrence_data %>% filter(Certified_Delinquent_Year>0)


ggplot(Lawrence_data, aes(amount_owed, color=Land_Bank)) +
  stat_bin(binwidth=1000, fill="white")+
  stat_bin(binwidth=1000, geom="text", aes(label=..count..), vjust=2.5)

What does the amount owed look like for those who do owe money?

summary(Lawrence_data$amount_owed)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.01     18.98    187.93   1018.06    950.91 138631.13

What’s the mean number of years delinquent for non-land-bank properties that owe more than $1,000?

Lawrence_Delinquent_Year <-  Lawrence_data %>% filter(Land_Bank=="0") %>% filter(amount_owed>1000) 

mean(Lawrence_Delinquent_Year$years_delinquent, na.rm=TRUE)
## [1] 9.577398

What about land bank properties?

Land_Bank <- Lawrence_data %>% 
  filter(Land_Bank==1) 

summary(Land_Bank$amount_owed)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     4.99   434.75  1208.31  2247.15  2923.28 17613.61

Looking at that visually:

ggplot(Land_Bank, aes(amount_owed, color=Land_Bank)) +
  stat_bin(binwidth=1000, fill="white")+
  stat_bin(binwidth=1000, geom="text", aes(label=..count..), vjust=2.5)

How many owe less than $5K?

Land_Bank %>% filter(amount_owed<5000) %>% nrow()
## [1] 316

What percentage is that? Note: more are being added all the time; as of the date of analysis, there were 362 properties that were in the land bank or had gone through it.

316/362
## [1] 0.8729282

Now let’s look at how long properties have been on the delinquent list, or were on it before remediation (or not). Keep in mind that to be on the delinquent list in Ohio, you have to have not paid your taxes for an entire year.

ggplot(Lawrence_data, aes(x=years_delinquent))+
  geom_bar()+
  stat_bin(binwidth=1, geom="text", aes(label=..count..), vjust=-2.5)

When did most sales take place?

ggplot(Lawrence_LB_sold, aes(x=year(SaleDate)))+
  geom_bar()

How much did non-land bank properties owe?

Non_Land_Bank <- 
  Lawrence_data %>% filter(Land_Bank==0) 

summary(Non_Land_Bank$amount_owed)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.01     15.58    152.94    925.71    838.41 138631.13

Looking at that visually:

ggplot(Non_Land_Bank, aes(amount_owed) )+
  stat_bin(binwidth=1000, fill="white")+
  stat_bin(binwidth=1000, geom="text", aes(label=..count..), vjust=2.5)

So most people are only a few years behind, but some have been on the list quite a while.

Note: it looks like Covid skewed things a bit, as there are very few 2021 delinquents, probably because of the foreclosure freeze.
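A quick tally by certified year (a sanity check on that) shows the drop-off:

Lawrence_data %>% st_drop_geometry() %>% count(Certified_Delinquent_Year) %>% tail() #if Covid is the culprit, 2021 should look noticeably thinner here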

Just looking at land bank properties now:

ggplot(Land_Bank, aes(x=years_delinquent))+
  geom_bar()+
  stat_bin(binwidth=1, geom="text", aes(label=..count..), vjust=-2.5)

Versus non-land-bank:

ggplot(Non_Land_Bank, aes(x=years_delinquent))+
  geom_bar()+
  stat_bin(binwidth=1, geom="text", aes(label=..count..), vjust=-2.5)

# Looking for features near our properties that might sway the data

Lawrence_River <- Lawrence_Geo %>% 
  filter(COMPNUM=="RIVER") %>% 
  dplyr::select(COMPNUM, geometry)


Lawrence_key <- Lawrence_River


plot(Lawrence_River)

## Adding the distance to the river

library(lwgeom)
## Linking to liblwgeom 3.0.0beta1 r16016, GEOS 3.10.2, PROJ 8.2.1
sf::sf_use_s2(FALSE)
## Spherical geometry (s2) switched off
Lawrence_key <- st_transform(Lawrence_key, crs= 4326)

Lawrence_key <- st_as_sf(Lawrence_key)

#Lawrence_LB_Geo <- Lawrence_LB_Geo %>% distinct(PARCEL, .keep_all = TRUE)

Lawrence_data <- st_transform(Lawrence_data, crs= 4326)

Lawrence_data <- st_as_sf(Lawrence_data)

#Lawrence_data <- Lawrence_data %>% distinct(PARCEL, .keep_all = TRUE)

Lawrence_Distance_Matrix <- as.data.frame(st_distance(Lawrence_key$geometry, Lawrence_data$geometry)) #This takes quite a while, just FYI

#Sets column names so each column is labeled with its parcel number
Lawrence_Distance_Matrix1 <- Lawrence_Distance_Matrix %>% 
  `colnames<-`(Lawrence_data$PARCEL)

#Lawrence_Distance_Matrix1 is the same as Lawrence_Distance_Matrix but with parcel numbers as column names so we don't get confused. Those labels are not part of the calculations (relying on them would be bad data practice), which is why we keep Lawrence_Distance_Matrix as well.

library(units)
## udunits database from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/units/share/udunits/udunits2.xml
Lawrence_Distance_Matrix_No_Units <- drop_units(Lawrence_Distance_Matrix1)



Lawrence_Smallest_Distance <- Lawrence_Distance_Matrix_No_Units %>%                                                                                  
  dplyr::summarise_all(~min(.))

Lawrence_Distance_Info1 <-Lawrence_Smallest_Distance %>% 
  pivot_longer("18-205-0900.000":"12-076-1600.022", names_to = "PARCEL", values_to = "Distance_To_River_In_Meters" )



Lawrence_data <- left_join(Lawrence_data, Lawrence_Distance_Info1, by="PARCEL")
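As an aside, a potentially faster route (a sketch we did not run) is to union the river shapes first, so a single st_distance() call returns each parcel’s distance directly and no matrix reshaping is needed:

river_union <- st_union(Lawrence_key$geometry) #collapse all river shapes into one geometry
direct_distances <- st_distance(Lawrence_data$geometry, river_union) #one distance per parcel
head(drop_units(direct_distances)) #should match Distance_To_River_In_Meters above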

Okay, next let’s take a look at how our variables correlate with the outcome.

Lawrence_ML  <-  Lawrence_data %>% 
  mutate(Land_Bank=as.factor(Land_Bank)) %>%  #`mapping` color column must be categorical, not numeric
  dplyr::select(-OWNER, -ADDRESS, -Certified_Delinquent_Year, -PARCEL, -ACRES)  #drop factors that shouldn't influence the outcome (like owner) or where we don't have enough data (like total value). Also, quite a few parcels are legally listed as 0 acres even though other records give them a size. See, e.g., https://lawrencecountyauditor.org/Parcel?Parcel=32-078-1600.000

Lawrence_ML  <- as.data.frame(Lawrence_ML) #because sf dataframes act weird here

Lawrence_ML  <- Lawrence_ML %>% 
    dplyr::select(-geometry)
  
ggpairs(Lawrence_ML, aes(col=Land_Bank), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Machine Learning

This is based on the Ecuador landslides project, in which researchers used machine learning to predict where the next landslide would happen next to a road in the rainforest.

Here, we are going to use past data on where remediations took place and see whether we can predict which delinquent properties will go into the land bank in the future.

# Creating a Spatial Task


library(mlr3spatiotempcv)
library(geosphere)

Lawrence_Spatial_ML <- Lawrence_data %>% 
  dplyr::select(-PARCEL, -OWNER, -ADDRESS, -Certified_Delinquent_Year) %>% 
  mutate(Centroid_Coordinates=st_centroid(geometry)) 
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
data_sf <- as_tibble(cbind(Lawrence_Spatial_ML$Centroid_Coordinates, Lawrence_Spatial_ML$ACRES, Lawrence_Spatial_ML$Land_Bank, Lawrence_Spatial_ML$amount_owed, Lawrence_Spatial_ML$years_delinquent, Lawrence_Spatial_ML$Distance_To_River_In_Meters )) 
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
data_sf <- data_sf %>% 
  rename(Centroid_Coordinates=V1, ACRES=V2,Land_Bank=V3, amount_owed=V4, years_delinquent=V5, Distance_To_River_In_Meters=V6 )

#The centroid prints as "c(x, y)", so split it on the comma...
data_sf <- separate(data_sf, Centroid_Coordinates, c("x","y"), sep = ",", remove = TRUE)


#...then strip the leading "c(" from x and the trailing ")" from y
data_sf <- data_sf %>% 
  mutate(x=str_remove(x,"^..")) %>% 
  mutate(y=str_remove(y,".$"))


data_sf$ACRES <- as.numeric(data_sf$ACRES)
data_sf$Land_Bank <- as.character(data_sf$Land_Bank)
data_sf$Land_Bank <- as.factor(data_sf$Land_Bank)
data_sf$amount_owed <- as.numeric(data_sf$amount_owed)
data_sf$years_delinquent <- as.integer(data_sf$years_delinquent)
data_sf$Distance_To_River_In_Meters <- as.numeric(data_sf$Distance_To_River_In_Meters)


data_sf <- data_sf %>% 
  filter(amount_owed>0) %>% 
  filter(years_delinquent>2) # We had to add this in because covid affected the foreclosure rates and policies. 



# create 'sf' object
data_sf = sf::st_as_sf(data_sf, coords = c("x", "y"), crs = 4326)

task = as_task_classif_st(data_sf, id = "Lawrence_Spatial_Task", target = "Land_Bank", positive = "1" )

print(task)
## <TaskClassifST:Lawrence_Spatial_Task> (2583 x 5)
## * Target: Land_Bank
## * Properties: twoclass
## * Features (4):
##   - dbl (3): ACRES, Distance_To_River_In_Meters, amount_owed
##   - int (1): years_delinquent
## * Coordinates:
##               X        Y
##    1: -82.37565 38.45147
##    2: -82.53373 38.45511
##    3: -82.43877 38.43831
##    4: -82.38329 38.51631
##    5: -82.43786 38.43916
##   ---                   
## 2579: -82.32732 38.57720
## 2580: -82.57544 38.43334
## 2581: -82.54996 38.71156
## 2582: -82.63659 38.58352
## 2583: -82.60848 38.54668

Note: because there are so few land bank properties in our dataset, the task is extremely difficult. We tried undersampling, but got inconsistent results when we went to interpret the model, so that didn’t work. For more on this, we highly suggest you check out this link.

table(task$truth())
## 
##    1    0 
##  307 2276
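For reference, here is a minimal sketch of the kind of undersampling we tried, using themis (loaded above). This is hypothetical and not part of the final pipeline:

balanced <- recipe(Land_Bank ~ ., data = st_drop_geometry(data_sf)) %>% #recipes comes in with tidymodels
  step_downsample(Land_Bank) %>% #themis: drop majority-class rows until the classes match
  prep() %>% 
  bake(new_data = NULL)
table(balanced$Land_Bank) #both classes now the size of the minority class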

## Creating spatial validation

Note: you have to use devtools::install_github("mlr-org/mlr3extralearners") to get the extra learners, because the mlr3 example only ships a simple decision tree and it’s much better to have a random forest model, or outliers will have much more influence. This also required installing the apcluster, mlr3proba, ooplah, and dictionar6 packages, then running mlr3extralearners::install_learners("classif.randomForest").

One of the largest issues in spatiotemporal data analysis is the inevitable presence of autocorrelation in the data. In other words, properties that are near each other are more likely to be alike anyway. Using a spatial task gives us a bias-reduced performance estimate. For more, see here.

library(mlr3extralearners)

learner = lrn("classif.randomForest", predict_type = "prob")
resampling_sp = rsmp("repeated_spcv_coords", folds = 4, repeats = 2)
rr_sp = mlr3::resample(
  task = task, learner = learner,
  resampling = resampling_sp)
## INFO  [16:13:10.646] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/8)
## INFO  [16:13:11.892] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 2/8)
## INFO  [16:13:12.394] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 3/8)
## INFO  [16:13:13.157] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 4/8)
## INFO  [16:13:13.600] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 5/8)
## INFO  [16:13:14.316] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 6/8)
## INFO  [16:13:14.796] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 7/8)
## INFO  [16:13:15.710] [mlr3] Applying learner 'classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 8/8)
rr_sp$aggregate(measures = msr("classif.ce")) #measuring classification error
## classif.ce 
##  0.1081635
rr_sp$aggregate(measures = msr("classif.acc")) #measuring accuracy
## classif.acc 
##   0.8918365
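For contrast, ordinary (non-spatial) cross-validation is a useful sanity check; on autocorrelated data like this it will typically report a flattering, overly optimistic error. A sketch, not part of our results:

resampling_nsp = rsmp("repeated_cv", folds = 4, repeats = 2) #same shape as above, but folds ignore geography
rr_nsp = mlr3::resample(task = task, learner = learner, resampling = resampling_nsp)
rr_nsp$aggregate(measures = msr("classif.ce")) #expect this to look better than the spatial estimate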

Looking at a confusion matrix:

# split into training and test
splits = partition(task, ratio = 0.8)
print(str(splits))
## List of 2
##  $ train: int [1:2067] 1 34 53 98 108 109 136 152 153 187 ...
##  $ test : int [1:516] 32 33 52 104 140 264 287 395 402 485 ...
## NULL
pred = learner$train(task, splits$train)$predict(task, splits$test)
pred$confusion
##         truth
## response   1   0
##        1  14   4
##        0  47 451

Looking good, though there are more false negatives than false positives. For reference:

The upper left quadrant is the number of times our model predicted the positive class and was correct about it. The lower right quadrant is the number of times our model predicted the negative class and was also correct about it. (Together, the elements on the diagonal are called True Positives and True Negatives.) The upper right quadrant is the number of times we falsely predicted a positive label, called False Positives. The lower left quadrant holds the False Negatives (FN).

We wanted to look at this to make sure we are not training an algorithm to just guess no most of the time (because usually the answer is no); we don’t want better accuracy at the cost of a bad model.
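Putting numbers on that concern, with simple arithmetic from the matrix above:

TP = 14; FP = 4; FN = 47; TN = 451 #read straight off the confusion matrix
precision = TP / (TP + FP) #~0.78: when we predict land bank, we're usually right
recall = TP / (TP + FN) #~0.23: but we miss most of the true land bank parcels
c(precision = precision, recall = recall)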

# Visualizing this

For example, here are the first four partitions of the first repetition:

autoplot(resampling_sp, task, fold_id = c(1:4), size = 0.7) *
  ggplot2::scale_y_continuous(breaks = seq(-37.57, -39.57, -0.01)) *
  ggplot2::scale_x_continuous(breaks = seq(-81.53, -83.53, -0.02))

# Feature importance

But how important is each feature to the model?

library(mlr3spatial)
library(mlr3fselect)

instance = fselect(
  method = "random_search",
  task =  task,
  learner = learner,
  resampling = rsmp("holdout"),
  measure = msr("classif.ce"),
  term_evals = 10,
  batch_size = 5
)
## INFO  [16:13:18.902] [bbotk] Starting to optimize 4 parameter(s) with '<FSelectorRandomSearch>' and '<TerminatorEvals> [n_evals=10, k=0]'
## INFO  [16:13:18.904] [bbotk] Evaluating 5 configuration(s)
## INFO  [16:13:19.322] [mlr3] Running benchmark with 5 resampling iterations
## INFO  [16:13:19.326] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:19.756] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:20.185] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:20.910] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:21.350] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:21.722] [mlr3] Finished benchmark
## INFO  [16:13:21.915] [bbotk] Result of batch 1:
## INFO  [16:13:21.916] [bbotk]  ACRES Distance_To_River_In_Meters amount_owed years_delinquent classif.ce
## INFO  [16:13:21.916] [bbotk]   TRUE                       FALSE       FALSE             TRUE  0.1149826
## INFO  [16:13:21.916] [bbotk]   TRUE                       FALSE        TRUE             TRUE  0.1161440
## INFO  [16:13:21.916] [bbotk]  FALSE                        TRUE        TRUE             TRUE  0.1149826
## INFO  [16:13:21.916] [bbotk]   TRUE                       FALSE        TRUE             TRUE  0.1161440
## INFO  [16:13:21.916] [bbotk]  FALSE                        TRUE       FALSE            FALSE  0.2195122
## INFO  [16:13:21.916] [bbotk]  runtime_learners                                uhash
## INFO  [16:13:21.916] [bbotk]             0.420 24e64a45-d231-4258-b9fb-f6dcb17eef1b
## INFO  [16:13:21.916] [bbotk]             0.420 6e4ee17b-30b6-49ca-b26c-df9db6b47fa6
## INFO  [16:13:21.916] [bbotk]             0.717 9227791e-c1fd-4ebc-8d9f-14a43a964c91
## INFO  [16:13:21.916] [bbotk]             0.432 0bd191ef-3a58-4657-b76e-bdcb0dfcad86
## INFO  [16:13:21.916] [bbotk]             0.366 37ac3077-8c11-470c-b961-58c880670a32
## INFO  [16:13:21.917] [bbotk] Evaluating 5 configuration(s)
## INFO  [16:13:22.521] [mlr3] Running benchmark with 5 resampling iterations
## INFO  [16:13:22.526] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:22.848] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:23.244] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:23.701] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:24.059] [mlr3] Applying learner 'select.classif.randomForest' on task 'Lawrence_Spatial_Task' (iter 1/1)
## INFO  [16:13:24.418] [mlr3] Finished benchmark
## INFO  [16:13:24.560] [bbotk] Result of batch 2:
## INFO  [16:13:24.561] [bbotk]  ACRES Distance_To_River_In_Meters amount_owed years_delinquent classif.ce
## INFO  [16:13:24.561] [bbotk]   TRUE                       FALSE       FALSE             TRUE  0.1149826
## INFO  [16:13:24.561] [bbotk]  FALSE                       FALSE        TRUE             TRUE  0.1091754
## INFO  [16:13:24.561] [bbotk]   TRUE                        TRUE        TRUE            FALSE  0.1324042
## INFO  [16:13:24.561] [bbotk]   TRUE                       FALSE        TRUE            FALSE  0.1370499
## INFO  [16:13:24.561] [bbotk]   TRUE                       FALSE       FALSE             TRUE  0.1149826
## INFO  [16:13:24.561] [bbotk]  runtime_learners                                uhash
## INFO  [16:13:24.561] [bbotk]             0.313 529bbaed-3fb5-409f-bfed-be7c1cc3ad58
## INFO  [16:13:24.561] [bbotk]             0.387 8e6b8eec-e9f3-4f57-a5a7-15d78f244e98
## INFO  [16:13:24.561] [bbotk]             0.449 5fb66fde-3b58-422d-9016-f3a7b064a44a
## INFO  [16:13:24.561] [bbotk]             0.348 47b929d1-a28e-43b4-8225-6ea132bd5ffd
## INFO  [16:13:24.561] [bbotk]             0.352 9c77b6f0-0de0-49a6-a2f7-571f6b6a546e
## INFO  [16:13:24.565] [bbotk] Finished optimizing after 10 evaluation(s)
## INFO  [16:13:24.565] [bbotk] Result:
## INFO  [16:13:24.566] [bbotk]  ACRES Distance_To_River_In_Meters amount_owed years_delinquent
## INFO  [16:13:24.566] [bbotk]  FALSE                       FALSE        TRUE             TRUE
## INFO  [16:13:24.566] [bbotk]                      features classif.ce
## INFO  [16:13:24.566] [bbotk]  amount_owed,years_delinquent  0.1091754

How important is each one?

library(iml)


Lawrence_data_no_geometry <- data_sf %>% 
  st_drop_geometry() #iml wants a plain data frame, not an sf object

model = Predictor$new(learner, data = Lawrence_data_no_geometry, y = "Land_Bank") #wrap the trained learner for iml
x.interest = data.frame(Lawrence_data_no_geometry[1, ]) #explain the model's prediction for the first parcel
shapley = Shapley$new(model, x.interest = x.interest)
plot(shapley)

# Looking at the decision tree

library(rpart)
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
library(rpart.plot)

binary.model <- rpart(learner$model, data = data_sf)
rpart.plot(binary.model, cex=1, digits = 6)

And how can we articulate this?

rpart.rules(binary.model)
##  Land_Bank                                                                                                                    
##       0.00 when Distance_To_River_In_Meters <  430 & years_delinquent >=       11 & amount_owed >=         1780 & ACRES >= 0.3
##       0.03 when Distance_To_River_In_Meters >= 430 & years_delinquent >=       38                                             
##       0.05 when Distance_To_River_In_Meters >= 430 & years_delinquent <   9                                                   
##       0.11 when Distance_To_River_In_Meters >= 430 & years_delinquent is 11 to 34                                             
##       0.15 when Distance_To_River_In_Meters <  430 & years_delinquent >=       11 & amount_owed <  1780                       
##       0.19 when Distance_To_River_In_Meters <  430 & years_delinquent <   9                                                   
##       0.26 when Distance_To_River_In_Meters <  430 & years_delinquent >=       11 & amount_owed >=         6415 & ACRES <  0.3
##       0.64 when Distance_To_River_In_Meters <  430 & years_delinquent >=       11 & amount_owed is 1780 to 6415 & ACRES <  0.3
##       0.75 when Distance_To_River_In_Meters >= 430 & years_delinquent is 34 to 38                                             
##       1.00 when Distance_To_River_In_Meters >= 430 & years_delinquent is  9 to 11                                             
##       1.00 when Distance_To_River_In_Meters <  430 & years_delinquent is  9 to 11

And where does the computer make predictions?

lrn = learner
lrn$train(task) #retrain the random forest on the full task

# predict task based on point data
data_sf$Land_Bank = NULL # remove response
task_predict = as_task_unsupervised(data_sf)

pred = predict_spatial(task_predict, learner)

plot(pred)

Plotting just the predictions, as the land bank parcels get a little lost there:

plot(filter(pred, Land_Bank==1))
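If we wanted those positive predictions back on the interactive map, a sketch (assuming predict_spatial() returned an sf point object, as the filter() call above suggests):

pred_pos <- filter(pred, Land_Bank == 1) #just the parcels the model flags
leaflet(pred_pos) %>% 
  addTiles() %>% 
  addCircleMarkers(radius = 4, color = "red", popup = "Predicted land bank") #hypothetical overlay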