Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try optimizing NCSP() #270

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 262 additions & 0 deletions R/NCSP.R
Original file line number Diff line number Diff line change
Expand Up @@ -495,3 +495,265 @@ NCSP <- function(

}


.NCSP2 <- function(
    x,
    vars,
    fm = NULL,
    weights = rep(1, times = length(vars)),
    maxDepth = max(x),
    k = 0,
    isColor = FALSE,
    rescaleResult = FALSE,
    progress = TRUE,
    verbose = TRUE,
    returnDepthDistances = FALSE
) {

  ## Numerical Classification of Soil Profiles (optimized draft of NCSP()).
  ##
  ## Dices profiles of `x` into 1-depth-unit slices over [0, maxDepth],
  ## computes a slice-wise dissimilarity matrix via .NCSP_distanceCalc(),
  ## applies an exponential depth weight exp(-k * slice), and sums over
  ## slices into a single `dist` object.
  ##
  ## Arguments mirror NCSP(); `fm`, `progress`, and `verbose` are accepted
  ## for interface compatibility but are not yet used here.

  ## deprecated arguments (accepted by the old interface, no longer used):
  # filter
  # replace_na
  # add_soil_flag
  # strict_hz_eval
  # plot.depth.matrix


  ## sanity checks

  # x must be a SPC
  if (!inherits(x, 'SoilProfileCollection')) {
    stop('`x` must be a SoilProfileCollection', call. = FALSE)
  }

  # vars must name horizon or site level attributes of x
  if (!all(vars %in% names(x))) {
    stop('`vars` must specify horizon or site level attributes of `x`', call. = FALSE)
  }

  ## TODO: consider a message and setting maxDepth <- max(x)
  # maxDepth must be a usable slicing depth
  # scalar condition: use ||, not | (R >= 4.3 errors on vector input to ||)
  if (maxDepth > max(x) || maxDepth < 1) {
    stop('`maxDepth` should be > 0 and <= max(`x`)', call. = FALSE)
  }

  # color comparison setup
  # BUG FIX: the CIELAB variable-name check was previously nested inside the
  # "farver missing" branch, directly after stop(), and was therefore
  # unreachable. Both checks must apply whenever isColor = TRUE.
  if (isColor) {

    # dE00 (CIE2000) requires the farver package
    if (!requireNamespace('farver', quietly = TRUE)) {
      stop('color comparison requires the `farver` package', call. = FALSE)
    }

    # for now, color comparisons are based on CIELAB color coordinates
    # must be named "L", "A", "B" and in that order
    # identical() also guards against a length mismatch, which elementwise
    # `!=` would silently recycle over
    if (!identical(vars, c('L', 'A', 'B'))) {
      stop('CIELAB color coordinates must be specified as `L`, `A`, `B` in `vars`', call. = FALSE)
    }
  }


  ## TODO: reconsider hard-coded top depth
  ## truncate profiles to [0, maxDepth]
  x <- trunc(x, 0, maxDepth)


  ## variables used in NCSP algorithm

  # number of variables
  n.vars <- length(vars)

  # depth-weighting vector based on k, indexed to the slice sequence
  # k = 0 -> all slices weighted equally (w = 1)
  w <- 1 * exp(-k * 1:maxDepth)

  ## split horizon / site vars
  hn <- horizonNames(x)
  sn <- siteNames(x)
  h.vars <- intersect(hn, vars)
  s.vars <- intersect(sn, vars)


  ## TODO: if there are any site data, extract those here
  ## and compute simple distance via cluster::daisy(, metric = 'gower')
  ## be sure to extract associated weights
  ##
  ## NOTE(refactor): see the site-data handling in NCSP() for the previous
  ## implementation; known issues there to avoid repeating:
  ##  - horizon data were rescaled via D/max(D) (bug)
  ##  - daisy() could return NA for boolean variables (replace with min D)
  ##  - variable type information should be user-specifiable


  ## TODO: expose LHS of formula to dice()
  ## NOTE: dice() LHS usually starts from 0, sliceSequence and soil.matrix are indexed from 1

  ## TODO: consider exposing depth logic subset options in arguments to NCSP
  ## for now, entire profiles are subset


  ## dice according to depth sequence and vars
  # preserve SPC for access to all site data
  # it could be better to perform sanity checks on horizonation outside of this function
  .seq <- seq(from = 0, to = maxDepth - 1, by = 1)
  .fm <- as.formula(
    sprintf('c(%s) ~ %s',
            paste0(.seq, collapse = ','),
            paste0(h.vars, collapse = ' + '))
  )

  # reset removed.profiles metadata, it may have been set prior to calling NCSP()
  metadata(x)$removed.profiles <- NULL

  ## dice
  # pctMissing is used to develop soil/non-soil matrix
  s <- suppressMessages(dice(x, fm = .fm, SPC = TRUE, fill = TRUE, byhz = FALSE, pctMissing = TRUE, strict = TRUE))

  # number of profiles, accounting for subset via dice()
  n.profiles <- length(s)
  # profile IDs
  .ids <- profile_id(s)

  # keep track of removed profiles, due to hz logic errors
  # (length(nchar(x)) > 0 simplified to length(x) > 0: equivalent for
  #  character vectors and for NULL)
  .removed.profiles <- metadata(s)$removed.profiles
  if (length(.removed.profiles) > 0) {
    warning('hz depth logic subset has removed some profiles', call. = FALSE)
  }

  ## slice sequence, 1-indexed
  sliceSequence <- 1:max(s)

  ## add soil flag, previously optional now required
  # using same data structure / conventions as profile_compare()
  # logical matrix [sliceSequence, 1:n.profiles]
  # [slices, profiles]

  ## TODO: ensure that in-line NA are correctly handled
  ## TODO: in-line NA may require another argument to determine assumptions
  ## TODO: define soil / non-soil using pattern matching on horizon designation

  # use a vector of horizon (slice) indices to extract all pctMissing values
  # develop a matrix from these
  soil.matrix <- matrix(
    s[, sliceSequence][['.pctMissing']],
    ncol = n.profiles,
    nrow = length(sliceSequence),
    byrow = FALSE
  )

  # "soil" has a pctMissing very close to 0
  soil.matrix <- soil.matrix < 0.00001
  # keep track of profile IDs
  dimnames(soil.matrix)[[2]] <- .ids


  ## evaluate distances by slice
  ## accounting for soil/non-soil comparisons
  ## filling NA due to missing data

  ## TODO: basic progress reporting
  ## TODO: cache identical slices
  ## TODO: if !returnDepthDistances: do not retain full list of dist mat, accumulate in single variable

  message(paste('Computing dissimilarity matrices from', n.profiles, 'profiles'), appendLF = FALSE)

  # horizon (slice) data for all profiles, restricted to vars
  .sall <- as.data.table(horizons(s))[, .SD, .SDcols = vars]

  # efficient row indexing, given the assumption (guaranteed by dice() with
  # fill = TRUE) of an equal number of slices per profile:
  # .sliceStarts holds the row offset of slice 0 for each profile; shifting
  # by slice index i yields the rows for slice i across all profiles
  .maxSlice <- max(sliceSequence)
  .sliceStarts <- which(seq_len(nrow(.sall)) %% .maxSlice == 0) - .maxSlice
  .v <- vapply(sliceSequence, FUN.VALUE = numeric(length(s)), function(i) .sliceStarts + i)

  # create dissimilarity matrix for each slice, scaled by depth weight w
  .ds <- data.table(.sliceID = sliceSequence)
  .d <- .ds[, list(list(
    .NCSP_distanceCalc(
      m = .sall[.v[, .sliceID], ],
      sm = soil.matrix[.sliceID, ],
      w = weights,
      isColor = isColor
    ) * w[.sliceID])), by = ".sliceID"]$V1

  ## optionally return list of per-depth distance matrices
  if (returnDepthDistances) {
    return(.d)
  }

  # print total size of D
  message(paste(" [", signif(object.size(.d) / 1024^2, 1), " Mb]", sep = ''))

  ## flatten list of distance matrices by summing over slices
  .d <- Reduce('+', .d)


  ## optionally normalize by dividing by max(D)
  # this is important when incorporating site data
  # TODO: causes problems for some functions like MASS::sammon() ?
  if (rescaleResult) {
    .d <- .d / max(.d, na.rm = TRUE)
  }


  ## metadata

  # distance metric
  if (isColor) {
    attr(.d, 'Distance Metric') <- 'CIE2000'
  } else {
    attr(.d, 'Distance Metric') <- 'Gower'
  }

  # removed profiles, if any
  attr(.d, 'removed.profiles') <- .removed.profiles

  # remove warnings about NA from cluster::daisy()
  attr(.d, 'NA.message') <- NULL

  # full set of profiles replaced with profile IDs stored in attr(.d, 'Labels')
  attr(.d, 'Labels') <- .ids

  # done
  return(.d)

}


132 changes: 132 additions & 0 deletions misc/NCSP/optimize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# NCSP() optimization
# Reprex comparing NCSP() against the draft .NCSP2(); lines starting with
# `#>` are recorded console output from a prior run.

# converted for-loop into data.table call for slice-wise .NCSP_distanceCalc

# costly operations in .NCSP_distanceCalc
# - daisy() for each slice is the primary cost (~1/3 execution time)
# - outer() twice for each slice adds up
# - conversion from dist->matrix->dist is somewhat costly (requires 2+ copy; adds up)

# opportunities
# - can daisy/outer/outer/matrix conversion etc. be called only once--for all slices?

library(aqp)
#> This is aqp 2.0

# setup horizon-level data: data are from lab sampled pedons
d <- read.csv(
textConnection('series,top,bottom,clay,frags,ph
auburn,0,3,21,6,5.6
auburn,3,15,21,13,5.6
auburn,15,25,20,9,5.8
auburn,25,47,21,28,5.8
dunstone,0,5,16,13,6
dunstone,5,17,17,19,6.3
dunstone,17,31,20,6,6.3
dunstone,31,41,21,15,6.3
sobrante,0,5,18,0,5.8
sobrante,5,10,16,2,5.7
sobrante,10,28,15,21,5.8
sobrante,28,51,18,13,6.2
sobrante,51,74,20,12,6.2'))

# establish site-level data
s <- data.frame(
series = c('auburn', 'dunstone', 'sobrante'),
precip = c(24, 30, 32)
)

# generate fake horizon names with clay / frags / ph
d$name <- with(d, paste(clay, frags, ph, sep='/'))

# upgrade to SoilProfile Collection object
depths(d) <- series ~ top + bottom
site(d) <- s
hzdesgnname(d) <- 'name'


# Benchmark 1: tiny example (3 profiles, 3 hz-level vars) ----
# memory = FALSE because bench::mark cannot profile allocations reliably
# here (see the GC warning in the recorded output)
bench::mark(
memory = FALSE,
before = NCSP(
d,
vars = c('clay', 'ph', 'frags'),
k = 0,
maxDepth = 61
),
after = aqp:::.NCSP2(
d,
vars = c('clay', 'ph', 'frags'),
k = 0,
maxDepth = 61
)
)
#> Computing dissimilarity matrices from 3 profiles
#> [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>  expression min median `itr/sec` mem_alloc `gc/sec`
#>  <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 before 517ms 517ms 1.93 NA 3.87
#> 2 after 114ms 131ms 7.78 NA 3.89

# Benchmark 2: medium case (100 random profiles, 5 vars) ----
# recorded result: ~2.5x speedup
set.seed(123)
x <- aqp::combine(lapply(1:100, random_profile, SPC = TRUE))
bench::mark(
memory = FALSE,
before = NCSP(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 143
),
after = aqp:::.NCSP2(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 143
)
)
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>  expression min median `itr/sec` mem_alloc `gc/sec`
#>  <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 before 2.52s 2.52s 0.397 NA 18.3
#> 2 after 1s 1s 0.999 NA 14.0

# Benchmark 3: large case (1000 random profiles, 5 vars) ----
# recorded result: smaller relative gain (~1.3x); the per-slice distance
# list grows large here (700 Mb) and GC dominates
set.seed(123)
x <- aqp::combine(lapply(1:1000, random_profile, SPC = TRUE))
bench::mark(
memory = FALSE,
before = NCSP(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 177
),
after = aqp:::.NCSP2(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 177
)
)
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>  expression min median `itr/sec` mem_alloc `gc/sec`
#>  <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 before 52.5s 52.5s 0.0191 NA 3.03
#> 2 after 40.9s 40.9s 0.0244 NA 3.86