# Author: Ewen Gallic # Date: November 17th, 2016 # ----------------------------------- # # KERNEL DENSITY ESTIMATION FUNCTIONS # # ----------------------------------- # # @n: number of points to define the circle # @centre: center of the circle # @radius: radius of the circle sCircle <- function(n = 100, centre = c(0, 0), radius){ theta <- seq(0, 2*pi, length = n) m <- cbind(cos(theta), sin(theta)) * radius m[, 1] <- m[, 1] + centre[1] m[, 2] <- m[, 2] + centre[2] colnames(m) <- c("x", "y") m }# End of sCircle() # @x: center of the circle, # @h: bandwidth scalar, # @polygon: polygon on which data points lie. sWeights <- function(x, h, polygon) { leCercle <- sCircle(centre = x, radius = 1.759*h) POLcercle <- as(leCercle[-nrow(leCercle),], "gpc.poly") return(area.poly(gpclib::intersect(polygon, POLcercle)) / area.poly(POLcercle)) }# End of sWeights() # @U: data points, # @polygon: polygon on which points lie, # @optimal: if TRUE, uses Hpi() to select the optimal bandwidth, # @h: only if optimal=FALSE, scalar bandwidth, # @parallel: if TRUE, computes the weights using SNOW clusters, # @n_clusters: only if n_clusters=TRUE, defines the number of clusters. sKDE <- function(U, polygon, optimal = TRUE, h = .1, parallel = FALSE, n_clusters = 4){ if(!class(polygon) == "gpc.poly") polygon <- as(polygon, "gpc.poly") if(class(U) == "data.frame") U <- as.matrix(U) IND <- which(is.na(U[, 1]) == FALSE) U <- U[IND,] n <- nrow(U) if(optimal){ H <- Hpi(U, binned = FALSE) H <- matrix(c(sqrt(H[1, 1] * H[2, 2]), 0, 0, sqrt(H[1, 1] * H[2, 2])), 2, 2) } if(!optimal){ H <- matrix(c(h, 0, 0, h), 2, 2) } # Help function to compute weights poidsU <- function(i, U, h, POL){ x <- as.numeric(U[i,]) sWeights(x, h, POL) } # Use parallel methods to compute if the number of observation is a bit high # Change the number of slaves according to the number of cores your processor has # It is recommended to use a maximum of the number of cores minus one. if(parallel){ cl <- makeCluster(n_clusters, type = "SOCK") clusterEvalQ(cl, library("gpclib")) clusterEvalQ(cl, library("sp")) clusterExport(cl, c("sCircle", "sWeights")) OMEGA <- parLapply(cl, 1:n, poidsU, U = U, h = sqrt(H[1, 1]), POL = polygon) OMEGA <- do.call("c", OMEGA) stopCluster(cl) }else{ OMEGA <- NULL for(i in 1:n){ OMEGA <- c(OMEGA, poidsU(i, U, h = sqrt(H[1, 1]), POL = polygon)) } } # Kernel Density Estimator fhat <- kde(U, H, w = 1/OMEGA, xmin = c(min(get.bbox(polygon)$x), min(get.bbox(polygon)$y)), xmax = c(max(get.bbox(polygon)$x), max(get.bbox(polygon)$y))) fhat$estimate <- fhat$estimate * sum(1/OMEGA) / n vx <- unlist(fhat$eval.points[1]) vy <- unlist(fhat$eval.points[2]) VX <- cbind(rep(vx, each = length(vy))) VY <- cbind(rep(vy, length(vx))) VXY <- cbind(VX, VY) Ind <- matrix(inside.gpc.poly(x = VX, y = VY, polyregion = polygon), length(vy), length(vx)) f0 <- fhat f0$estimate[t(Ind) == 0] <- NA list( X = fhat$eval.points[[1]], Y = fhat$eval.points[[2]], Z = fhat$estimate, ZNA = f0$estimate, H = fhat$H, W = fhat$w) }# End of sKDE() # @U: data points, # @polygon: polygon on which points lie, # @optimal: if TRUE, uses Hpi() to select the optimal bandwidth, # @h: only if optimal=FALSE, scalar bandwidth, sKDE_without_c = function(U, polygon, optimal = TRUE, h = .1){ polygon <- as(polygon, "gpc.poly") IND <- which(is.na(U[,1]) == FALSE) U <- U[IND,] n <- nrow(U) if(optimal){ H <- Hpi(U,binned=FALSE) H <- matrix(c(sqrt(H[1, 1] * H[2, 2]), 0, 0, sqrt(H[1, 1] * H[2, 2])), 2, 2) } if(!optimal){ H <- matrix(c(h, 0, 0, h), 2, 2) } # Kernel density estimator fhat <- kde(U, H, xmin = c(min(get.bbox(polygon)$x), min(get.bbox(polygon)$y)), xmax = c(max(get.bbox(polygon)$x), max(get.bbox(polygon)$y))) vx <- unlist(fhat$eval.points[1]) vy <- unlist(fhat$eval.points[2]) VX <- cbind(rep(vx, each = length(vy))) VY <- cbind(rep(vy, length(vx))) VXY <- cbind(VX,VY) Ind <- matrix(inside.gpc.poly(x = VX, y = VY, polyregion = polygon), length(vy), length(vx)) f0 <- fhat f0$estimate[t(Ind) == 0] <- NA list( X = fhat$eval.points[[1]], Y = fhat$eval.points[[2]], Z = fhat$estimate, ZNA = f0$estimate, H = fhat$H, W = fhat$W) }# End of sKDE_without_c() # @smooth : result from sKDE() or sKDE_without_c(); # @breaks : breaks for the legend (seq(min(smooth$Z)*.95,max(smooth$Z)*1.05,length=21) by default); # @polygon : polygon on which data points lie; # @coord : coordinates (X, Y) of data points; # @alpha_coords : transparency for data points (.8 by default); # @size_coords : size for data points (.8 by default); # @many_points : if TRUE, @coord must be the result of condense() (package bigvis). It is helpful when there are too many points to display (FALSE by default); # @colContour : colour of the contour of the polygon ("white" by default); # @colPoints : colour of the data points ("dodger blue" by default); # @title : title (if provided) to give to the plot; # @contour : if FALSE, contour are not plotted (TRUE by default); # @round : round value for the legend (2 by default); # @text_size : text size (22 by default). plot_sKDE <- function(smooth, breaks, polygon, coord, alpha_coords = .8, size_coords = .8, many_points = FALSE, colContour="white", colPoints="dodger blue", title, contour=TRUE, round = 2, text_size = 22){ # Get the right format for ggplot2 obtenirMelt <- function(smoothed){ res <- melt(smoothed$ZNA) res[,1] <- smoothed$X[res[,1]] res[,2] <- smoothed$Y[res[,2]] names(res) <- list("X","Y","ZNA") return(res) } smCont <- obtenirMelt(smooth) if(missing(breaks)) breaks <- seq(min(smooth$Z)*.95,max(smooth$Z)*1.05,length=21) smCont$colour <- cut(smCont[,"ZNA"],breaks=breaks,labels=round(breaks[-1],digits=round)) smCont$colour2 <- as.character(cut(smCont[,"ZNA"],breaks=breaks,labels=rev(heat.colors(length(breaks)-1)))) if(is.null(polygon$group)) polygon$group <- factor(1) P <- ggplot() + geom_polygon(data = polygon, aes(x = x, y = y, group = group), fill = NA, col = "black") + geom_tile(aes(x = X, y = Y, fill = ZNA), alpha = .9, data = smCont[!is.na(smCont$ZNA),], na.rm=TRUE) lesLabels <- round(breaks,round) lesIndicesLabels <- floor(seq(1,length(lesLabels),length.out=5)) # Only keep 5 points for the legend values lesIndicesLabels[length(lesIndicesLabels)] <- length(lesLabels) # Making sure we display the last value lesLabels <- as.character(lesLabels[lesIndicesLabels]) lesLabels[lesLabels=="0"] <- "0.00" if(contour) P <- P + geom_contour(data = smCont[!is.na(smCont$ZNA),], aes(x = X, y = Y, z = ZNA), alpha=0.6, colour = colContour, breaks = breaks[lesIndicesLabels]) if(many_points){ P <- P + geom_point(data = coord, aes(x = X, y = Y, alpha = .count), col = "blue", size = size_coords) + scale_alpha_continuous(guide=FALSE) }else{ P <- P + geom_point(data = coord[,c("X", "Y")], aes(x = X, y = Y), alpha = alpha_coords, col = "blue", size = size_coords) } if(contour){ # To add contour levels ind_level <- which(unlist(lapply(ggplot_build(P)$data, function(x) "level" %in% colnames(x)))) tmp <- ggplot_build(P)$data[[ind_level]] ind <- unlist(lapply(unique(tmp$piece), function(x){ corresp <- which(tmp$piece == x) corresp[round(length(corresp)/2)] })) tmp$level_r <- round(tmp$level, round) P <- P + geom_text(aes(label = level_r, z = NULL, x = x, y = y), data=tmp[ind,]) } P <- P + scale_fill_gradient(name="",low='yellow', high='red', breaks=breaks[lesIndicesLabels], limits=range(breaks),labels=lesLabels) P <- P + theme(axis.text.x=element_blank(), axis.text.y=element_blank(), axis.ticks.x=element_blank(), axis.ticks.y=element_blank(), axis.title=element_blank(), text = element_text(size = text_size)) P <- P + geom_polygon(data=polygon, mapping=(aes(x=x, y=y)), colour="black", fill=NA) # Add a title if one was provided if(!missing(title)) P <- P + ggtitle(title) P } library(rgdal) library(surveillance) library(maptools) library(ggplot2) library(plyr) library(ellipse) library(fields) library(gpclib) library(ks) library(maps) library(rgeos) library(snow) library(sp) library(ggmap) library(reshape2) # --------- # # LOAD DATA # # --------- # # Source: Here’s Waldo: Computing the optimal search strategy for finding Waldo (2015), Randal S. Olson # URL: http://www.randalolson.com/2015/02/03/heres-waldo-computing-the-optimal-search-strategy-for-finding-waldo/ waldo <- read.csv("wheres-waldo-locations.csv") head(waldo) range(waldo$X) # Let's say 0:13 range(waldo$Y) # Let's say 0:8 # Plot of Waldo's locations p <- ggplot(data = waldo, aes(x = X, y = Y)) + geom_point() + scale_x_continuous(breaks = seq(0,13)) + scale_y_continuous(breaks = seq(0,8)) # ------------- # # THE RECTANGLE # # ------------- # # The rectangle on which we look for Waldo bounding_polygon <- data.frame(x = c(0, 0, 13, 13, 0), y = c(0, 8, 8, 0, 0)) # As a gpc.poly object bounding_polygon_gpc <- as(bounding_polygon, "gpc.poly") # ---------- # # ESTIMATION # # ---------- # # Estimation with border correction smoothed_kde <- sKDE(U = waldo[, c("X", "Y")], polygon = bounding_polygon_gpc, optimal=TRUE, parallel = FALSE) # Estimation without border correction smoothed_kde_nc <- sKDE_without_c(U = waldo[, c("X", "Y")], polygon = bounding_polygon_gpc, optimal=TRUE) # ------ # # GRAPHS # # ------ # p <- plot_sKDE(smooth = smoothed_kde, coord = waldo[, c("X", "Y")], alpha_coords = .8, size_coords = 1, breaks = seq(min(smoothed_kde$ZNA, smoothed_kde_nc$ZNA,na.rm=T)*.95, max(smoothed_kde$ZNA, smoothed_kde_nc$ZNA,na.rm=T)*1.05, length=21), polygon = bounding_polygon, round = 3, colContour = "black") + ggtitle("With correction") + coord_equal() p p_nc <- plot_sKDE(smooth = smoothed_kde_nc, coord = waldo[, c("X", "Y")], alpha_coords = .8, size_coords = 1, breaks = seq(min(smoothed_kde$ZNA, smoothed_kde_nc$ZNA,na.rm=T)*.95, max(smoothed_kde$ZNA, smoothed_kde_nc$ZNA,na.rm=T)*1.05, length=21), polygon = bounding_polygon, round = 3, colContour = "black") + ggtitle("Without correction") + coord_equal() p_nc # -------------------------------- # # TAKING THE POSTCARD INTO ACCOUNT # # -------------------------------- # # Actually, there is a postcard in the top left corner # Let's create it postcard <- data.frame(x = c(1, 1, 3, 3, 1), y = c(5, 7.5, 7.5, 5, 5)) # As a gpc.poly object postcard_gpc <- as(postcard, "gpc.poly") ggplot(data = waldo, aes(x = X, y = Y)) + geom_point() + scale_x_continuous(breaks = seq(0,13)) + scale_y_continuous(breaks = seq(0,8)) + geom_polygon(data = postcard, aes(x = x, y = y), fill = NA, col = "blue") # Remove the postcard from the bounding polygon bound_pol <- gpclib::setdiff(bounding_polygon_gpc, postcard_gpc) # ------------------------------------------- # # ESTIMATION TAKING THE POSTCARD INTO ACCOUNT # # ------------------------------------------- # # Estimation with correction of the border bias smoothed_kde_post <- sKDE(U = waldo[, c("X", "Y")], polygon = bound_pol, optimal=TRUE, parallel = FALSE) # Estimation without the correction smoothed_kde_post_nc <- sKDE_without_c(U = waldo[, c("X", "Y")], polygon = bound_pol, optimal=TRUE) # ------ # # GRAPHS # # ------ # p_post <- plot_sKDE(smooth = smoothed_kde_post, coord = waldo[, c("X", "Y")], alpha_coords = .8, size_coords = 1, breaks = seq(min(smoothed_kde_post$ZNA, smoothed_kde_post_nc$ZNA,na.rm=T)*.95, max(smoothed_kde_post$ZNA, smoothed_kde_post_nc$ZNA,na.rm=T)*1.05, length=21), polygon = bounding_polygon, round = 3, colContour = "black") + geom_polygon(data = postcard, aes(x = x, y = y), fill = NA, colour = "black") + ggtitle("With correction") + coord_equal() p_post p_post_nc <- plot_sKDE(smooth = smoothed_kde_post_nc, coord = waldo[, c("X", "Y")], alpha_coords = .8, size_coords = 1, breaks = seq(min(smoothed_kde_post$ZNA, smoothed_kde_post_nc$ZNA,na.rm=T)*.95, max(smoothed_kde_post$ZNA, smoothed_kde_post_nc$ZNA,na.rm=T)*1.05, length=21), polygon = bounding_polygon, round = 3, colContour = "black") + geom_polygon(data = postcard, aes(x = x, y = y), fill = NA, colour = "black") + ggtitle("Without correction") + coord_equal() p_post_nc