From 4f04dc5b595ece5a6743413e35c25b68a0bfd04b Mon Sep 17 00:00:00 2001
From: GreatEmerald
Date: Mon, 10 Jan 2022 11:31:23 +0100
Subject: [PATCH] Update tutorial to use ranger instead of randomforest

---
 index.Rmd  |  16 +-
 index.html | 861 ++++++++++++++++++++++++++---------------
 2 files changed, 437 insertions(+), 440 deletions(-)

diff --git a/index.Rmd b/index.Rmd
index 0ce29dd..54afc5c 100644
--- a/index.Rmd
+++ b/index.Rmd
@@ -11,7 +11,12 @@ output:
   highlight.chooser: TRUE
 ---
 
-# [WUR Geoscripting](https://geoscripting-wur.github.io/) WUR logo
+
+
+# [WUR Geoscripting](https://geoscripting-wur.github.io/) WUR logo
 
 ```{r, echo=FALSE, message=FALSE}
 library(knitr)
@@ -32,7 +37,7 @@ Morning | Self-study: go through the following tutorial
 Rest of the afternoon | Do/finalise the exercise.
 -->
 
-# Week 2, Lesson 7: Advanced Raster Analysis
+# Week 2, Lesson 6: Advanced Raster Analysis
 
 ## Learning outcomes of today:
@@ -482,7 +487,9 @@ We could take this approach further and apply a minimum mapping unit (MMU) to ou
 As we have seen with the land cover rasters we derived using the random forest above, the values of a raster may be categorical, meaning they relate to a thematic class (e.g. 'forest' or 'wetland') rather than a quantitative value (e.g. NDVI or % Tree Cover). The raster dataset 'lulcGewata' is a raster with integer values representing Land Use and Land Cover (LULC) classes from a 2011 classification (using SPOT5 and ASTER source data).
 
 ```{r}
-load("data/lulcGewata.rda")
+download.file("https://github.com/GeoScripting-WUR/AdvancedRasterAnalysis/releases/download/thematic-data/lulcGewata.zip", "data/lulcGewata.zip")
+unzip("data/lulcGewata.zip", exdir="data")
+lulcGewata <- raster("data/lulcGewata.tif")
 ## Check out the distribution of the values
 freq(lulcGewata)
 hist(lulcGewata)
 ```
 
 This is a raster with integer values between 1 and 6, but for this raster to be meaningful at all, we need a lookup table (LUT) to identify these classes. A `data.frame` defining these classes is also included in the lesson repository:
 
 ```{r}
-load("data/LUTGewata.rda")
+LUTGewata <- read.csv("data/LUTGewata.csv")
+LUTGewata[[1]] <- NULL # Remove the name column
 LUTGewata
 ```

diff --git a/index.html b/index.html
index 7c41dbe..7522d5e 100644
--- a/index.html
+++ b/index.html
@@ -7,12 +7,144 @@
 Advanced Raster Analysis
@@ -873,7 +1005,11 @@
-

WUR Geoscripting WUR logo

+ +

WUR Geoscripting WUR logo

@@ -891,7 +1027,7 @@

WUR Geos 14:00 to 14:40 | Presentation and discussion Rest of the afternoon | Do/finalise the exercise. --> -

Week 2, Lesson 7: Advanced Raster Analysis

+

Week 2, Lesson 6: Advanced Raster Analysis

Learning outcomes of today:

Being able to:

-
-

When we plot the histogram of the RasterBrick, the scales of the axes and the bin sizes are not equivalent, which could be problematic. This can be solved by adjusting these parameters in hist(). The raster hist() function inherits arguments from the function of the same name in the graphics package. To view additional arguments, type:

-
- -
?graphics::hist
-
-

To ensure that our histograms are of the same scale, we should consider the xlim, ylim and breaks arguments.

+

You might get different values for cellStats() and maxValue(). The reason for this is that cellStats() looks at all the pixel values, whereas maxValue() only takes a subset of the pixel values. As a result, maxValue() is faster than cellStats().
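As a quick illustration (a minimal sketch, not part of the original lesson code, assuming the Gewata brick loaded above):

cellStats(Gewata$B8A, stat = "max") # scans the pixel values of the layer
maxValue(Gewata$B8A)                # returns the maximum stored with the layer, hence faster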

-
par(mfrow = c(1, 1)) # reset plotting window
-hist(gewata, xlim = c(0, 5000), ylim = c(0, 750000), breaks = seq(0, 5000, by = 100))
- -
## Warning in .hist1(raster(x, y[i]), maxpixels = maxpixels, main = main[y[i]], :
-## 5% of the raster cells were used. 100000 values used.
-
-## Warning in .hist1(raster(x, y[i]), maxpixels = maxpixels, main = main[y[i]], :
-## 5% of the raster cells were used. 100000 values used.
-
-## Warning in .hist1(raster(x, y[i]), maxpixels = maxpixels, main = main[y[i]], :
-## 5% of the raster cells were used. 100000 values used.
-
+
# histograms for all the bands in one window (automatic, if a RasterBrick is supplied)
+hist(Gewata, maxpixels=1000)
- +
@@ -994,18 +1097,24 @@

Introduction to Landsat data use -
pairs(gewata)
+
pairs(Gewata, maxpixels=1000)
- +

Note that both hist() and pairs() compute histograms and scatterplots based on a random sample of raster pixels. The size of this sample can be changed with the argument maxpixels in either function.

-

Calling pairs() on a RasterBrick reveals potential correlations between the layers themselves. In the case of bands 2-4 of the gewata subset, we can see that band 2 and 3 (in the visual part of the EM spectrum) are highly correlated, while band 4 contains significant non-redundant information.

+

Calling pairs() on a RasterBrick reveals potential correlations between the layers themselves. In the case of the bands of the Gewata subset, we can see that bands 3 and 5 (in the visual part of the EM spectrum) and bands 6 and 7 (in the near infrared part of the EM spectrum) are highly correlated. A similar correlation also exists between bands 11 and 12, while band 8A contains significant non-redundant information.

+
+
-

Question 1: Given what we know about the location of these bands along the EM spectrum, how could these scatterplots be explained?

+

+Question 1: Given what we know about the location of these bands along the EM spectrum, how could these scatterplots be explained? +

+
+

ETM+ band 4 (nearly equivalent to band 5 in the Landsat 8 OLI sensor) is situated in the near infrared (NIR) region of the EM spectrum and is often used to describe vegetation-related features.

We observe a strong correlation between two of the Landsat bands of the gewata subset, but a very different distribution of values in band 4 (NIR). This distribution stems from the fact that vegetation reflects very highly in the NIR range, compared to the visual range of the EM spectrum. However, note that NIR reflectance saturates in very dense vegetated areas. A commonly used metric for assessing vegetation dynamics, the normalized difference vegetation index (NDVI), explained in the previous lesson, takes advantage of this fact and is computed from Landsat bands 3 (visible red) and 4 (near infra-red).

In the previous lesson, we explored several ways to calculate NDVI, using direct raster algebra, calc() or overlay(). Since we will be using NDVI again later in this tutorial, let’s calculate it again and store it in our workspace using overlay().

@@ -1014,187 +1123,89 @@

Introduction to Landsat data use R source
par(mfrow = c(1, 1)) # reset plotting window
-ndvi <- overlay(GewataB4, GewataB3, fun=function(x,y){(x-y)/(x+y)})
+ndvi <- overlay(Gewata$B8A, Gewata$B04, fun=function(x,y){(x-y)/(x+y)})
 plot(ndvi)
- + +
+

Aside from the advantages of calc() and overlay() regarding memory usage, an additional advantage of these functions is that the result can be written immediately to file by including the filename = "..." argument, after which you can reload the result in subsequent sessions without having to repeat your analysis.
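For instance, a minimal sketch (the file name is only illustrative) of computing NDVI and writing it to disk in one step, then reloading it in a later session:

ndvi <- overlay(Gewata$B8A, Gewata$B04,
                fun = function(x, y) {(x - y) / (x + y)},
                filename = "data/ndvi.grd", overwrite = TRUE)
# later on, reload the result instead of recomputing it
ndvi <- raster("data/ndvi.grd")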

+
+
+
+

+Question 2: What is the advantage of including the NDVI layer in the classification? +

+
+
-

Aside from the advantages of calc() and overlay() regarding memory usage, an additional advantage of these functions is the fact that the result can be written immediately to file by including the filename = "..." argument, which will allow you to write your results to file immediately, after which you can reload in subsequent sessions without having to repeat your analysis.

Classifying raster data

One of the most important tasks in remote sensing image analysis is image classification. In classifying the image, we use the information contained in the various bands (possibly including other synthetic bands such as NDVI or principal components) to assign pixels to classes. There are two approaches for image classification: supervised and unsupervised. In this tutorial we will explore supervised classification based on the Random Forest method.

Supervised classification: Random Forest

The Random Forest classification algorithm is an ensemble learning method that is used for both classification and regression. In our case, we will use the method for classification purposes. Here, the Random Forest method takes random subsets from a training dataset and constructs classification trees using each of these subsets. Trees consist of branches and leaves.

Branches represent nodes of the decision trees, which are often thresholds defined for the measured (known) variables in the dataset. Leaves are the class labels assigned at the termini of the trees. Sampling many subsets at random will result in many trees being built. Classes are then assigned based on a majority rule, as if each class assigned by a decision tree were considered to be a vote.

-

The figure below gives a simple demonstration of how the random forest method works in principle. For an introduction to the Random Forest algorithm, see this presentation.

-
- -
-

One major advantage of the Random Forest method is the fact that an Out Of the Bag (OOB) error estimate and an estimate of variable performance are performed. For each classification tree assembled, a fraction of the training data are left out and used to compute the error for each tree by predicting the class associated with that value and comparing with the already known class. This process results in a confusion matrix, which we will explore in our analysis. In addition an importance score is computed for each variable in two forms: the mean decrease in accuracy for each variable, and the Gini impurity criterion, which will also be explored in our analysis.

-

We should first prepare the data on which the classification will be done. So far, we have prepared three bands from an ETM+ image in 2001 (bands 2, 3 and 4) as a RasterBrick, and have also calculated NDVI. In addition, there is a Vegetation Continuous Field (VCF) product available for the same period (2000).

-

For more information on the Landsat VCF product, see here. This product is also based on Landsat ETM+ data, and represents an estimate of tree cover (in %). Since this layer could also be useful in classifying land cover types, we will also include it as a potential covariate in the Random Forest classification.

+

The figure below gives a simple demonstration of how the random forest method works in principle. For an introduction to the Random Forest algorithm, see this presentation. For more information on the random forest implementation in R, see this tutorial.

+

Schematic showing how the Random Forest method constructs classification trees from random subsets of a training dataset. Each tree determines the labels assigned based on the training dataset. Once all trees are assembled, classes are assigned to unknown pixels based on the class which receives the majority of votes based on all the decision trees constructed.

+

One major advantage of the Random Forest method is that it provides an Out-Of-Bag (OOB) cross-validation error estimate and an estimate of variable performance. For each classification tree assembled, a fraction of the training data are left out and used to compute the error for each tree by predicting the class associated with that value and comparing it with the already known class. This process results in a confusion matrix, which we will explore in our analysis. In addition, an importance score is computed for each variable in two forms: the mean decrease in accuracy for each variable, and the Gini impurity criterion, which will also be explored in our analysis.

+

To perform the classification in R, it is best to assemble all covariate layers (i.e. those layers containing predictor variable values) into one RasterBrick object. In this case, we can simply append the new layer (NDVI) to our existing RasterBrick (currently consisting of different bands).

+

First, let's rescale the integer values back to the original reflectance scale. This step is not required for the RF classification, but it might help with the interpretation, if you are used to thinking of reflectance as a value between 0 and 1. (On the other hand, for very large raster bricks, it might be preferable to leave them in their integer scale, but we won't go into more detail about that here.)

-
## Load the data and check it out
-load("data/vcfGewata.rda")
-vcfGewata
- -
## class      : RasterLayer 
-## dimensions : 1177, 1548, 1821996  (nrow, ncol, ncell)
-## resolution : 30, 30  (x, y)
-## extent     : 808755, 855195, 817635, 852945  (xmin, xmax, ymin, ymax)
-## crs        : +proj=utm +zone=36 +datum=WGS84 +units=m +no_defs +ellps=WGS84 +towgs84=0,0,0 
-## source     : memory
-## names      : vcf2000Gewata 
-## values     : 0, 254  (min, max)
-
- -
plot(vcfGewata)
-
-
- -
+
gewata <- calc(Gewata, fun=function(x) x / 10000)
+## Make a new RasterStack of covariates by 'stacking' together the existing gewata brick and NDVI
+covs <- stack(gewata, ndvi)
+plot(covs)
+

You’ll notice that we didn’t give our NDVI layer a name yet. It’s good to make sure that the raster layer names make sense, so you don’t forget which band is which later on. Let’s change all the layer names (make sure you get the order right!).

+
-
summary(vcfGewata)
+
names(covs) <- c(names(Gewata),"NDVI")
+names(covs)
-
##         vcf2000Gewata
-## Min.                0
-## 1st Qu.            32
-## Median             64
-## 3rd Qu.            75
-## Max.              254
-## NA's             8289
+
##  [1] "B02"  "B03"  "B04"  "B05"  "B06"  "B07"  "B11"  "B12"  "B8A"  "NDVI"
 
- -
hist(vcfGewata)
-
-
- -
-
-

In the vcfGewata rasterLayer there are some values much greater than 100 (the maximum tree cover), which are flags for water, cloud or cloud shadow pixels. To avoid these values, we can assign a value of NA to these pixels so they are not used in the classification.

+

Training data preparation

+

For this exercise, we will do a very simple classification for 2020 using three classes: forest, cropland and wetland. While for other purposes it is usually better to define more classes (and possibly fuse classes later), a simple classification like this one could be useful, for example, to construct a forest mask for the year 2020.

-
vcfGewata[vcfGewata > 100] <- NA
-plot(vcfGewata)
-
-
- -
-
- -
summary(vcfGewata)
+
## we load the training polygons as a csv file using st_read:
+trainingPoly <- st_read("data/trainingPoly.csv")
-
##         vcf2000Gewata
-## Min.                0
-## 1st Qu.            32
-## Median             64
-## 3rd Qu.            75
-## Max.              100
-## NA's            13712
-
- -
hist(vcfGewata)
-
-
- -
-
-
-

To perform the classification in R, it is best to assemble all covariate layers (ie. those layers contaning predictor variable values) into one RasterBrick object. In this case, we can simply append these new layers (NDVI and VCF) to our existing RasterBrick (currently consisting of bands 2, 3, and 4).

-

First, let’s rescale the original reflectance values to their original scale. This step is not required for the RF classification, but it might help with the interpretation, if you are used to thinking of reflectance as a value between 0 and 1. (On the other hand, for very large raster bricks, it might be preferable to leave them in their integer scale, but we won’t go into more detail about that here.)

-
- -
gewata <- calc(gewata, fun=function(x) x / 10000)
- -
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
-## prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition
+
## Reading layer `trainingPoly' from data source 
+##   `/home/dainius/Documents/geoscripting/AdvancedRasterAnalysis/data/trainingPoly.csv' 
+##   using driver `CSV'
+## Simple feature collection with 16 features and 3 fields
+## Geometry type: POLYGON
+## Dimension:     XY
+## Bounding box:  xmin: 813300.2 ymin: 822891.5 xmax: 846719.1 ymax: 848830.4
+## CRS:           NA
 
-
## Make a new RasterStack of covariates by 'stacking' together the existing gewata brick and NDVI and VCF layers
-covs <- stack(gewata, ndvi, vcfGewata)
-plot(covs)
-
-

You’ll notice that we didn’t give our NDVI layer a name yet. It’s good to make sure that the raster layer names make sense, so you don’t forget which band is which later on. Let’s change all the layer names (make sure you get the order right!).

-
- -
names(covs) <- c("band2", "band3", "band4", "NDVI", "VCF")
-plot(covs)
-
-
- -
-
-
-

Training data preparation

-

For this exercise, we will do a very simple classification for 2001 using three classes: forest, cropland and wetland. While for other purposes it is usually better to define more classes (and possibly fuse classes later), a simple classification like this one could be useful, for example, to construct a forest mask for the year 2001.

-
- -
## Load the training polygons as a simple features object
-trainingPoly <- readRDS("data/trainingPoly.rds")
-# Note that alternatively we could load a file that is not specific to R using st_read:
-#trainingPoly <- st_read("data/trainingPoly.csv")
-## Superimpose training polygons onto NDVI plot
+
## Superimpose training polygons onto NDVI plot
+par(mfrow = c(1, 1)) # reset plotting window
 plot(ndvi)
 plot(trainingPoly, add = TRUE)
- -
## Warning in plot.sf(trainingPoly, add = TRUE): ignoring all but the first
-## attribute
-
- +
@@ -1204,44 +1215,51 @@

Training data preparation

R source
## Inspect the trainingPoly object
+trainingPoly <- trainingPoly[, c(2:4)] # remove an unused column
 trainingPoly
## Simple feature collection with 16 features and 2 fields
-## geometry type:  POLYGON
-## dimension:      XY
-## bbox:           xmin: 813300.2 ymin: 822891.5 xmax: 846719.1 ymax: 848830.4
-## projected CRS:  UTM Zone 36, Northern Hemisphere
+## Geometry type: POLYGON
+## Dimension:     XY
+## Bounding box:  xmin: 813300.2 ymin: 822891.5 xmax: 846719.1 ymax: 848830.4
+## CRS:           NA
 ## First 10 features:
-##   OBJECTID   Class                       geometry
-## 0        1 wetland POLYGON ((821935.6 833523.3...
-## 1        2 wetland POLYGON ((824714 831022.8, ...
-## 2        3 wetland POLYGON ((830913.4 832637.4...
-## 3        4 wetland POLYGON ((832451.1 833023.5...
-## 4        5 wetland POLYGON ((834905.6 837919.2...
-## 5        6  forest POLYGON ((833159.4 846635.4...
-## 6        7  forest POLYGON ((831922.1 848830.4...
-## 7        8  forest POLYGON ((842202 832724.6, ...
-## 8        9  forest POLYGON ((840860.9 829199.5...
-## 9       10  forest POLYGON ((839926.8 824919.4...
+##    OBJECTID   Class                       geometry
+## 1         1 wetland POLYGON ((821935.6 833523.3...
+## 2         2 wetland POLYGON ((824714 831022.8, ...
+## 3         3 wetland POLYGON ((830913.4 832637.4...
+## 4         4 wetland POLYGON ((832451.1 833023.5...
+## 5         5 wetland POLYGON ((834905.6 837919.2...
+## 6         6  forest POLYGON ((833159.4 846635.4...
+## 7         7  forest POLYGON ((831922.1 848830.4...
+## 8         8  forest POLYGON ((842202 832724.6, ...
+## 9         9  forest POLYGON ((840860.9 829199.5...
+## 10       10  forest POLYGON ((839926.8 824919.4...
 
-
# The 'Class' column is actually a factor
-str(trainingPoly$Class)
+
# The 'Class' column is a character but should be converted to factor 
+summary(trainingPoly$Class)
-
##  Factor w/ 3 levels "cropland","forest",..: 3 3 3 3 3 2 2 2 2 2 ...
+
##    Length     Class      Mode 
+##        16 character character
 
-
# If you had to load the training polygons from a generic vector file and not an .RDS,
-# you would need to manually convert the 'Class' column to a factor, i.e.
-#trainingPoly$Class <- as.factor(trainingPoly$Class)
+
trainingPoly$Class <- as.factor(trainingPoly$Class)
+summary(trainingPoly$Class)
+ +
## cropland   forest  wetland 
+##        6        5        5
+
-
## Simple feature collection with 16 features and 3 fields
-## geometry type:  POLYGON
-## dimension:      XY
-## bbox:           xmin: 813300.2 ymin: 822891.5 xmax: 846719.1 ymax: 848830.4
-## projected CRS:  UTM Zone 36, Northern Hemisphere
-## First 10 features:
-##   OBJECTID   Class                       geometry Code
-## 0        1 wetland POLYGON ((821935.6 833523.3...    3
-## 1        2 wetland POLYGON ((824714 831022.8, ...    3
-## 2        3 wetland POLYGON ((830913.4 832637.4...    3
-## 3        4 wetland POLYGON ((832451.1 833023.5...    3
-## 4        5 wetland POLYGON ((834905.6 837919.2...    3
-## 5        6  forest POLYGON ((833159.4 846635.4...    2
-## 6        7  forest POLYGON ((831922.1 848830.4...    2
-## 7        8  forest POLYGON ((842202 832724.6, ...    2
-## 8        9  forest POLYGON ((840860.9 829199.5...    2
-## 9       10  forest POLYGON ((839926.8 824919.4...    2
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##   1.000   1.000   2.000   1.938   3.000   3.000
 
-

To train the raster data, we need to convert our training data to the same type using the rasterize() function. This function takes a spatial object (in this case a polygon object) and transfers the values to raster cells defined by a raster object. Here, we will define a new raster containing those values.

-
- -
## Assign 'Code' values to raster cells (where they overlap)
-classes <- rasterize(trainingPoly, ndvi, field='Code')
-
-
- -
-
-
-

-Note: there is a handy progress=“text” argument, which can be passed to many of the raster package functions and can help to monitor processing. Try passing this argument to therasterize() command above. -

-
-
-

Our goal in preprocessing these data is to have a table of values representing all layers (covariates) with known values/classes. To do this, we will first need to create a version of our RasterBrick only representing the training pixels. Here, we use the mask() function from the raster package.

-
- -
covmasked <- mask(covs, classes)
- -
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
-## prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition
-
- -
plot(covmasked)
-
-
- -
-
- -
## Combine this new brick with the classes layer to make our input training dataset
-names(classes) <- "class"
-trainingstack <- addLayer(covmasked, classes)
-plot(trainingstack)
-
-
- -
+
-

Now we convert these data to a data.frame representing all training data. This data.frame will be used as an input into the RandomForest classification function. We will use getValues() to extract all of the values from the layers of the RasterBrick.

-
- -
## Extract all values into a matrix
-valuetable <- getValues(trainingstack)
-

If you print valuetable to the console, you will notice that a lot of the rows are filled with NA. This is because all raster cell values have been taken, include those with NA values. We can get rid of these rows by using the na.omit() function.

+

Our goal in preprocessing this data is to have a table of values representing all layers (covariates) with known values/classes. To do this, we will first need to know the values of the covariates at our training polygon locations. We can use the extract() function of the raster package for this. Next, we convert these data to a data.frame representing all training data.

-
valuetable <- na.omit(valuetable)
+
## Extract pixel values below the polygons
+trainingData<-extract(covs, trainingPoly)
+# Check structure
+str(trainingData)
+# It is a matrix (rows are pixels, columns are bands) per polygon
+# Convert each matrix to a data.frame and add the class column from the polygons
+valuetable <- lapply(1:length(trainingData), function(polygonID) cbind.data.frame(trainingData[[polygonID]], Class=trainingPoly$Class[polygonID]))
+# Unlist into one long data.frame
+valuetable <- do.call(rbind, valuetable)
-

Now we will convert the matrix to a data.frame and inspect the first and last 10 rows.

+

This data.frame will be used as an input into the RandomForest classification function. We then inspect the first and last 10 rows.

-
valuetable <- as.data.frame(valuetable)
-head(valuetable, n = 10)
+
head(valuetable, n = 10)
-
##     band2  band3  band4      NDVI VCF class
-## 1  0.0354 0.0281 0.2527 0.7998576  77     2
-## 2  0.0417 0.0301 0.2816 0.8068656  74     2
-## 3  0.0418 0.0282 0.2857 0.8203250  72     2
-## 4  0.0397 0.0282 0.2651 0.8077054  71     2
-## 5  0.0355 0.0263 0.2237 0.7896000  77     2
-## 6  0.0396 0.0281 0.2693 0.8110289  75     2
-## 7  0.0375 0.0300 0.2817 0.8075072  76     2
-## 8  0.0396 0.0263 0.2610 0.8169161  76     2
-## 9  0.0354 0.0263 0.2320 0.7963609  76     2
-## 10 0.0333 0.0263 0.2113 0.7786195  73     2
+
##       B02    B03    B04    B05    B06    B07    B11    B12    B8A      NDVI
+## 1  0.0492 0.0746 0.0680 0.1105 0.1999 0.2280 0.2425 0.1496 0.2608 0.5863747
+## 2  0.0518 0.0759 0.0705 0.1168 0.2093 0.2397 0.2505 0.1576 0.2706 0.5866315
+## 3  0.0523 0.0772 0.0710 0.1189 0.2073 0.2418 0.2501 0.1567 0.2717 0.5856434
+## 4  0.0482 0.0714 0.0674 0.1120 0.1971 0.2265 0.2331 0.1394 0.2617 0.5903981
+## 5  0.0476 0.0732 0.0675 0.1126 0.1959 0.2257 0.2364 0.1421 0.2586 0.5860166
+## 6  0.0481 0.0716 0.0671 0.1117 0.1932 0.2231 0.2360 0.1433 0.2550 0.5833592
+## 7  0.0483 0.0726 0.0678 0.1128 0.1997 0.2287 0.2398 0.1483 0.2559 0.5810936
+## 8  0.0517 0.0760 0.0700 0.1161 0.2057 0.2356 0.2484 0.1558 0.2678 0.5855536
+## 9  0.0528 0.0757 0.0717 0.1176 0.2049 0.2360 0.2501 0.1570 0.2699 0.5802108
+## 10 0.0502 0.0745 0.0679 0.1145 0.2065 0.2351 0.2480 0.1502 0.2676 0.5952310
+##      Class
+## 1  wetland
+## 2  wetland
+## 3  wetland
+## 4  wetland
+## 5  wetland
+## 6  wetland
+## 7  wetland
+## 8  wetland
+## 9  wetland
+## 10 wetland
 
-
##        band2  band3  band4      NDVI VCF class
-## 36211 0.0451 0.0293 0.2984 0.8211779  76     2
-## 36212 0.0406 0.0275 0.2561 0.8060649  76     2
-## 36213 0.0361 0.0293 0.2179 0.7629450  75     2
-## 36214 0.0406 0.0313 0.2222 0.7530572  74     2
-## 36215 0.0405 0.0313 0.2222 0.7530572  73     2
-## 36216 0.0406 0.0293 0.2646 0.8006124  79     2
-## 36217 0.0429 0.0293 0.2774 0.8089338  70     2
-## 36218 0.0451 0.0333 0.2900 0.7939994  77     2
-## 36219 0.0406 0.0293 0.2689 0.8034876  81     2
-## 36220 0.0429 0.0293 0.2434 0.7851118  73     2
+
##          B02    B03    B04    B05    B06    B07    B11    B12    B8A      NDVI
+## 87195 0.0352 0.0544 0.0563 0.0986 0.1997 0.2351 0.2193 0.1324 0.2652 0.6497667
+## 87196 0.0377 0.0586 0.0651 0.1053 0.1789 0.2219 0.2357 0.1534 0.2433 0.5778210
+## 87197 0.0400 0.0594 0.0736 0.1108 0.1637 0.1842 0.2491 0.1729 0.2144 0.4888889
+## 87198 0.0445 0.0750 0.0807 0.1316 0.2074 0.2370 0.2476 0.1602 0.2707 0.5406944
+## 87199 0.0334 0.0537 0.0528 0.0960 0.2105 0.2668 0.2187 0.1242 0.2930 0.6946212
+## 87200 0.0385 0.0653 0.0605 0.1127 0.2108 0.2402 0.2329 0.1482 0.2709 0.6348823
+## 87201 0.0475 0.0758 0.0835 0.1279 0.2009 0.2237 0.2624 0.1716 0.2552 0.5069383
+## 87202 0.0578 0.0941 0.1053 0.1617 0.2302 0.2592 0.2904 0.1873 0.2959 0.4750748
+## 87203 0.0342 0.0559 0.0585 0.0957 0.1683 0.1825 0.2304 0.1466 0.2152 0.5725247
+## 87204 0.0364 0.0588 0.0689 0.1169 0.1619 0.1887 0.2472 0.1672 0.2074 0.5012667
+##          Class
+## 87195 cropland
+## 87196 cropland
+## 87197 cropland
+## 87198 cropland
+## 87199 cropland
+## 87200 cropland
+## 87201 cropland
+## 87202 cropland
+## 87203 cropland
+## 87204 cropland
 
-

We have our training dataset as a data.frame, let’s convert the class column into a factor (since the values as integers don’t really have a meaning).

+

We have our training dataset as a data.frame with the Class column as a factor. If it were an integer, random forest regression would be run instead of classification, so it is good to check this.
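A quick check (a small sketch, not part of the original code):

is.factor(valuetable$Class) # should be TRUE
# If it were still character or integer, convert it first:
# valuetable$Class <- as.factor(valuetable$Class)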

+

Now we have a convenient training data table which contains, for each of the three defined classes, values for all covariates. Let’s visualize the distribution of some of these covariates for each class. To make this easier, we will create 3 different data.frames for each of the classes. This is just for plotting purposes, and we will not use these in the actual classification.

-
valuetable$class <- factor(valuetable$class, levels = c(1:3))
-
-

Now we have a convenient training data table which contains, for each of the three defined classes,values for all covariates. Let’s visualize the distribution of some of these covariates for each class. To make this easier, we will create 3 different data.frames for each of the classes. This is just for plotting purposes, and we will not use these in the actual classification.

-
- -
val_crop <- subset(valuetable, class == 1)
-val_forest <- subset(valuetable, class == 2)
-val_wetland <- subset(valuetable, class == 3)
+
val_crop <- subset(valuetable, Class == "cropland")
+val_forest <- subset(valuetable, Class == "forest")
+val_wetland <- subset(valuetable, Class == "wetland")
 
 ## NDVI
 par(mfrow = c(3, 1))
-hist(val_crop$NDVI, main = "cropland", xlab = "NDVI", xlim = c(0, 1), ylim = c(0, 4000), col = "orange")
-hist(val_forest$NDVI, main = "forest", xlab = "NDVI", xlim = c(0, 1), ylim = c(0, 4000), col = "dark green")
-hist(val_wetland$NDVI, main = "wetland", xlab = "NDVI", xlim = c(0, 1), ylim = c(0, 4000), col = "light blue")
+hist(val_crop$NDVI, main = "cropland", xlab = "NDVI", xlim = c(0, 1), col = "orange")
+hist(val_forest$NDVI, main = "forest", xlab = "NDVI", xlim = c(0, 1), col = "dark green")
+hist(val_wetland$NDVI, main = "wetland", xlab = "NDVI", xlim = c(0, 1), col = "light blue")
- +
-

Other covariates such as VCF can also be plotted like above.

+

Other covariates such as the bands can also be plotted like above.

-
## 3. Bands 3 and 4 (scatterplots)
-plot(band4 ~ band3, data = val_crop, pch = ".", col = "orange", xlim = c(0, 0.2), ylim = c(0, 0.5))
-points(band4 ~ band3, data = val_forest, pch = ".", col = "dark green")
-points(band4 ~ band3, data = val_wetland, pch = ".", col = "light blue")
+
## 3. Bands 8a and 11 (scatterplots)
+plot(B8A ~ B11, data = val_crop, pch = ".", col = "orange", xlim = c(0, 0.4), ylim = c(0, 0.5))
+points(B8A ~ B11, data = val_forest, pch = ".", col = "dark green")
+points(B8A ~ B11, data = val_wetland, pch = ".", col = "light blue")
 legend("topright", legend=c("cropland", "forest", "wetland"), fill=c("orange", "dark green", "light blue"), bg="white")
- +
-
-

Try to produce the same scatterplot plot as in #3 looking at the relationship between other bands (e.g. bands 2 and 3, band 4 and VCF, etc.)

-
-

We can see from these distributions that these covariates may do well in classifying forest pixels, but we may expect some confusion between cropland and wetland (although the individual bands may help to separate these classes). You can save the training data using the write.csv() command, in case something goes wrong after this point and you need to start over again.

-

Run Random Forest classification

-

We build the Random Forest model using the training data. For this, we will use the randomForest package in R. Using the randomForest() function, we will build a model based on a matrix of predictors or covariates (ie. the first 5 columns of valuetable) related to the response (the class column of valuetable).

-
+
+

-Note: the randomForest package is the reference implementation of Random Forest in R. However, it is very slow. For any work beyond such a simple example, you should use the ranger package, which is implemented in C++ with multithreading and thus is much faster.
+Question 3: Try to produce the same scatterplot as above looking at the relationship between other bands. Try B02 & B05, B07 & NDVI (you might have to adjust the xlim to incorporate the NDVI values) and another of your choice. What can you say about the relationships between these bands? Which ones give a clear distinction between classes, and where is this less clear?

+
+

We can see from these distributions that these covariates may do well in classifying forest pixels, but we may expect some confusion between cropland and wetland (although the individual bands may help to separate these classes). You can save the training data using the write.csv() command, in case something goes wrong after this point and you need to start over again.
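For example (the file name is just an illustration):

write.csv(valuetable, "data/valuetable.csv", row.names = FALSE)
# To restore it later: the Class column is read back as character,
# so convert it to a factor again
valuetable <- read.csv("data/valuetable.csv")
valuetable$Class <- as.factor(valuetable$Class)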

+

Run Random Forest classification

+

We build the Random Forest model using the training data. For this, we will use the ranger package in R. There is also the randomForest package available in R; however, ranger is implemented in C++ with multithreading and is thus much faster. Using the ranger() function, we will build a model based on a matrix of predictors or covariates (i.e. the first 10 columns of valuetable) related to the response (the Class column of valuetable).

+

Construct a random forest model.

+

Covariates (x) are found in columns 1 to 10 of valuetable. Training classes (y) are found in the 'Class' column of valuetable. Caution: this step takes fairly long! It can be shortened by setting importance = "none", so that variable importance is not computed.

-
## Construct a random forest model
-# Covariates (x) are found in columns 1 to 5 of valuetable
-# Training classes (y) are found in the 'class' column of valuetable
-## Caution: this step takes fairly long!
-# but can be shortened by setting importance=FALSE
-
-# Check for randomForest package and install if missing
-if(!"randomForest" %in% rownames(installed.packages())){install.packages("randomForest")}
-
-library(randomForest)
-modelRF <- randomForest(x=valuetable[ ,c(1:5)], y=valuetable$class,
-                        importance = TRUE)
+
library(ranger)
+modelRF <- ranger(x=valuetable[, 1:(ncol(valuetable)-1)], y=valuetable$Class,
+                  importance = "permutation", seed=0xfedbeef)
- -
## randomForest 4.6-14
-
- -
## Type rfNews() to see new features/changes/bug fixes.
-
+

Since the random forest method involves the building and testing of many classification trees (the ‘forest’), it is a computationally expensive step (and could take a lot of memory for especially large training datasets). When this step is finished, it would be a good idea to save the resulting object with the saveRDS() command. Any R object can be saved as an .rds file and reloaded into future sessions using readRDS().
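A minimal sketch (the file name is illustrative):

saveRDS(modelRF, "data/modelRF.rds")
# in a later session:
modelRF <- readRDS("data/modelRF.rds")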

@@ -1499,7 +1449,7 @@

Run Random Forest classification

-

The resulting object from the randomForest() function is a specialized object of class randomForest, which is a large list-type object packed full of information about the model output. Elements of this object can be called and inspected like any list object.

+

The resulting object from the ranger() function is a specialized object of class ranger, which is a large list-type object packed full of information about the model output. Elements of this object can be called and inspected like any list object.
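For example, a few elements worth inspecting (a small sketch; the element names follow the ranger documentation):

names(modelRF)           # list all elements of the ranger object
modelRF$confusion.matrix # OOB confusion matrix for the training data
modelRF$prediction.error # overall OOB prediction error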

-

Since we set importance=TRUE, we now also have information on the statistical importance of each of our covariates which we can visualize using the varImpPlot() command.

+

Earlier we provided a brief explanation of OOB error. Though it can be a valuable metric for evaluating your model, it can overestimate the true prediction error depending on the parameters present in the model.

+

Since we set importance="permutation", we now also have information on the statistical importance of each of our covariates, which we can retrieve using the importance() command.

-
varImpPlot(modelRF)
-
-
- -
-
+
importance(modelRF)
+ +
##        B02        B03        B04        B05        B06        B07        B11 
+## 0.34201052 0.17654999 0.21132279 0.09419896 0.07904063 0.08446068 0.11889577 
+##        B12        B8A       NDVI 
+## 0.21622389 0.08892067 0.12864201
+
-

The figure above shows the variable importance plots for a Random Forest model showing the mean decrease in accuracy (left) and the decrease in Gini Impurity Coefficient (right) for each variable.

-

These two plots give two different reports on variable importance (see ?importance()).

-

First, the mean decrease in accuracy indicates the amount by which the classification accuracy decreased based on the OOB assessment. Second, the Gini impurity coefficient gives a measure of class homogeneity. More specifically, the decrease in the Gini impurity coefficient when including a particular variable is shown in the plot. From Wikipedia: “Gini impurity is a measure of how often a randomly chosen element from the set would be incorrectly labeled if it were randomly labeled according to the distribution of labels in the subset”.

-

In this case, it seems that Gewata bands 3 and 4 have the highest impact on accuracy, while bands 3 and 2 score highest with the Gini impurity criterion. For especially large datasets, it may be helpful to know this information, and leave out less important variables for subsequent runs of the randomForest() function.

-

Since the VCF layer included NAs (which have also been excluded in our results) and scores relatively low according to the mean accuracy decrease criterion, try to construct an alternate Random Forest model as above, but excluding this layer.

+

The output above shows the variable importance for the Random Forest model, i.e. the mean decrease in accuracy for each variable.

+

The mean decrease in accuracy indicates the amount by which the classification accuracy decreased based on the OOB assessment. In this case, it seems that Gewata bands 12 and 2 have the highest impact on accuracy. For large datasets, it may be helpful to know this information, and leave out less important variables for subsequent runs of the ranger() function.

+

Since the NDVI layer scores relatively low according to the mean accuracy decrease criterion, try to construct an alternate Random Forest model as above, but excluding this layer; you can name it something like 'modelRF2'.
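A minimal sketch of such a model, dropping the NDVI column and timing the call (the name modelRF2 is just a suggestion):

system.time(
  modelRF2 <- ranger(x = valuetable[, !(names(valuetable) %in% c("NDVI", "Class"))],
                     y = valuetable$Class,
                     importance = "permutation", seed = 0xfedbeef)
)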

+
+
-

Question 2: What effect does this have on the overall accuracy of the results (hint: compare the confusion matrices of the original and new outputs). What effect does leaving this variable out have on the processing time (hint: use system.time())?

+

+Question 4: What effect does this have on the overall accuracy of the results (hint: compare the confusion matrices of the original and new outputs). What effect does leaving this variable out have on the processing time (hint: use system.time())? +

-

Now we apply this model to the rest of the image and assign classes to all pixels. Note that for this step, the names of the raster layers in the input brick (here covs) must correspond exactly to the column names of the training table. We will use the predict() function from the raster package. This function uses a pre-defined model to predict values of raster cells based on other raster layers. This model can be derived by a linear regression, for example. In our case, we will use the model provided by the randomForest() function.

+
+
+

Now we apply this model to the rest of the image and assign classes to all pixels. Note that for this step, the names of the raster layers in the input brick (here covs) must correspond exactly to the column names of the training table. We will use the predict() function from the raster package. This function uses a pre-defined model to predict values of raster cells based on other raster layers. This model can be derived by a linear regression, for example. In our case, we will use the model provided by the ranger() function.

- -
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
-## prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition
-
+
-
## [1] "band2" "band3" "band4" "NDVI"  "VCF"
+
##  [1] "B02"  "B03"  "B04"  "B05"  "B06"  "B07"  "B11"  "B12"  "B8A"  "NDVI"
 
-
## [1] "band2" "band3" "band4" "NDVI"  "VCF"   "class"
+
##  [1] "B02"   "B03"   "B04"   "B05"   "B06"   "B07"   "B11"   "B12"   "B8A"  
+## [10] "NDVI"  "Class"
 
@@ -1571,7 +1521,7 @@

Run Random Forest classification

R source
## Predict land cover using the RF model
-predLC <- predict(covs, model=modelRF, na.rm=TRUE)
+predLC <- raster::predict(covs, modelRF, fun=function(...) predict(...)$predictions)
@@ -1604,7 +1554,7 @@

Applying a raster sieve by clumping plot(formask, col="dark green", legend = FALSE)
- +
@@ -1616,22 +1566,46 @@

Applying a raster sieve by clumping -
## Group raster cells into clumps based on the Queen's Case
-if(!file.exists(fn <- "data/clumformask.grd")) {
-  forestclumps <- clump(formask, directions=8, filename=fn)
-} else {
-  forestclumps <- raster(fn)
-}
- +
## 
+## Attaching package: 'igraph'
+
+ +
## The following object is masked from 'package:raster':
+## 
+##     union
+
+ +
## The following objects are masked from 'package:stats':
+## 
+##     decompose, spectrum
+
+ -
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
-## prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in CRS definition
+
## The following object is masked from 'package:base':
+## 
+##     union
 
-
plot(forestclumps, col=topo.colors(nrow(forestclumps)))
+
## Group raster cells into clumps based on the Queen's Case
+if(!file.exists(fn <- "data/clumformask.grd")) {
+  forestclumps <- clump(formask, directions=8, filename=fn)
+} else {
+  forestclumps <- raster(fn)
+}
+plot(forestclumps, col=topo.colors(nrow(forestclumps)))

When we inspect the frequency table with freq(), we can see the number of raster cells included in each of these clump IDs.
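For reference, a minimal sketch of such an inspection:

clumpFreq <- freq(forestclumps) # two columns: clump ID ("value") and cell count
head(clumpFreq)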

@@ -1668,7 +1642,7 @@

Applying a raster sieve by clumping plot(formask, ext=e, col="dark green", legend=FALSE, main='formask')

- +
-
load("data/lulcGewata.rda")
+
download.file("https://github.com/GeoScripting-WUR/AdvancedRasterAnalysis/releases/download/thematic-data/lulcGewata.zip", "data/lulcGewata.zip")
+unzip("data/lulcGewata.zip", exdir="data")
+lulcGewata <- raster("data/lulcGewata.tif")
 ## Check out the distribution of the values
 freq(lulcGewata)
hist(lulcGewata)
+ +
## Warning in .hist1(x, maxpixels = maxpixels, main = main, plot = plot, ...): 5%
+## of the raster cells were used. 100000 values used.
+
- +
@@ -1726,7 +1714,8 @@

Working with thematic rasters

-
load("data/LUTGewata.rda")
+
LUTGewata <- read.csv("data/LUTGewata.csv")
+LUTGewata[[1]] <- NULL # Remove the name column
 LUTGewata
- +