Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try optimizing NCSP() #270

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
262 changes: 262 additions & 0 deletions R/NCSP.R
Original file line number Diff line number Diff line change
Expand Up @@ -495,3 +495,265 @@ NCSP <- function(

}


.NCSP2 <- function(
    x,
    vars,
    fm = NULL,
    weights = rep(1, times = length(vars)),
    maxDepth = max(x),
    k = 0,
    isColor = FALSE,
    rescaleResult = FALSE,
    progress = TRUE,
    verbose = TRUE,
    returnDepthDistances = FALSE
) {

  ## Numerical Classification of Soil Profiles (optimized draft of NCSP()).
  ##
  ## Dices profiles of `x` into 1-depth-unit slices over [0, maxDepth],
  ## computes a slice-wise dissimilarity matrix via .NCSP_distanceCalc(),
  ## applies an exponential depth weight exp(-k * slice), and sums over
  ## slices into a single `dist` object.
  ##
  ## Arguments mirror NCSP(); `fm`, `progress`, and `verbose` are accepted
  ## for interface compatibility but are not yet used here.

  ## deprecated arguments (accepted by the old interface, no longer used):
  # filter
  # replace_na
  # add_soil_flag
  # strict_hz_eval
  # plot.depth.matrix


  ## sanity checks

  # x must be a SPC
  if (!inherits(x, 'SoilProfileCollection')) {
    stop('`x` must be a SoilProfileCollection', call. = FALSE)
  }

  # vars must name horizon or site level attributes of x
  if (!all(vars %in% names(x))) {
    stop('`vars` must specify horizon or site level attributes of `x`', call. = FALSE)
  }

  ## TODO: consider a message and setting maxDepth <- max(x)
  # maxDepth must be a usable slicing depth
  # scalar condition: use ||, not | (R >= 4.3 errors on vector input to ||)
  if (maxDepth > max(x) || maxDepth < 1) {
    stop('`maxDepth` should be > 0 and <= max(`x`)', call. = FALSE)
  }

  # color comparison setup
  # BUG FIX: the CIELAB variable-name check was previously nested inside the
  # "farver missing" branch, directly after stop(), and was therefore
  # unreachable. Both checks must apply whenever isColor = TRUE.
  if (isColor) {

    # dE00 (CIE2000) requires the farver package
    if (!requireNamespace('farver', quietly = TRUE)) {
      stop('color comparison requires the `farver` package', call. = FALSE)
    }

    # for now, color comparisons are based on CIELAB color coordinates
    # must be named "L", "A", "B" and in that order
    # identical() also guards against a length mismatch, which elementwise
    # `!=` would silently recycle over
    if (!identical(vars, c('L', 'A', 'B'))) {
      stop('CIELAB color coordinates must be specified as `L`, `A`, `B` in `vars`', call. = FALSE)
    }
  }


  ## TODO: reconsider hard-coded top depth
  ## truncate profiles to [0, maxDepth]
  x <- trunc(x, 0, maxDepth)


  ## variables used in NCSP algorithm

  # number of variables
  n.vars <- length(vars)

  # depth-weighting vector based on k, indexed to the slice sequence
  # k = 0 -> all slices weighted equally (w = 1)
  w <- 1 * exp(-k * 1:maxDepth)

  ## split horizon / site vars
  hn <- horizonNames(x)
  sn <- siteNames(x)
  h.vars <- intersect(hn, vars)
  s.vars <- intersect(sn, vars)


  ## TODO: if there are any site data, extract those here
  ## and compute simple distance via cluster::daisy(, metric = 'gower')
  ## be sure to extract associated weights
  ##
  ## NOTE(refactor): see the site-data handling in NCSP() for the previous
  ## implementation; known issues there to avoid repeating:
  ##  - horizon data were rescaled via D/max(D) (bug)
  ##  - daisy() could return NA for boolean variables (replace with min D)
  ##  - variable type information should be user-specifiable


  ## TODO: expose LHS of formula to dice()
  ## NOTE: dice() LHS usually starts from 0, sliceSequence and soil.matrix are indexed from 1

  ## TODO: consider exposing depth logic subset options in arguments to NCSP
  ## for now, entire profiles are subset


  ## dice according to depth sequence and vars
  # preserve SPC for access to all site data
  # it could be better to perform sanity checks on horizonation outside of this function
  .seq <- seq(from = 0, to = maxDepth - 1, by = 1)
  .fm <- as.formula(
    sprintf('c(%s) ~ %s',
            paste0(.seq, collapse = ','),
            paste0(h.vars, collapse = ' + '))
  )

  # reset removed.profiles metadata, it may have been set prior to calling NCSP()
  metadata(x)$removed.profiles <- NULL

  ## dice
  # pctMissing is used to develop soil/non-soil matrix
  s <- suppressMessages(dice(x, fm = .fm, SPC = TRUE, fill = TRUE, byhz = FALSE, pctMissing = TRUE, strict = TRUE))

  # number of profiles, accounting for subset via dice()
  n.profiles <- length(s)
  # profile IDs
  .ids <- profile_id(s)

  # keep track of removed profiles, due to hz logic errors
  # (length(nchar(x)) > 0 simplified to length(x) > 0: equivalent for
  #  character vectors and for NULL)
  .removed.profiles <- metadata(s)$removed.profiles
  if (length(.removed.profiles) > 0) {
    warning('hz depth logic subset has removed some profiles', call. = FALSE)
  }

  ## slice sequence, 1-indexed
  sliceSequence <- 1:max(s)

  ## add soil flag, previously optional now required
  # using same data structure / conventions as profile_compare()
  # logical matrix [sliceSequence, 1:n.profiles]
  # [slices, profiles]

  ## TODO: ensure that in-line NA are correctly handled
  ## TODO: in-line NA may require another argument to determine assumptions
  ## TODO: define soil / non-soil using pattern matching on horizon designation

  # use a vector of horizon (slice) indices to extract all pctMissing values
  # develop a matrix from these
  soil.matrix <- matrix(
    s[, sliceSequence][['.pctMissing']],
    ncol = n.profiles,
    nrow = length(sliceSequence),
    byrow = FALSE
  )

  # "soil" has a pctMissing very close to 0
  soil.matrix <- soil.matrix < 0.00001
  # keep track of profile IDs
  dimnames(soil.matrix)[[2]] <- .ids


  ## evaluate distances by slice
  ## accounting for soil/non-soil comparisons
  ## filling NA due to missing data

  ## TODO: basic progress reporting
  ## TODO: cache identical slices
  ## TODO: if !returnDepthDistances: do not retain full list of dist mat, accumulate in single variable

  message(paste('Computing dissimilarity matrices from', n.profiles, 'profiles'), appendLF = FALSE)

  # horizon (slice) data for all profiles, restricted to vars
  .sall <- as.data.table(horizons(s))[, .SD, .SDcols = vars]

  # efficient row indexing, given the assumption (guaranteed by dice() with
  # fill = TRUE) of an equal number of slices per profile:
  # .sliceStarts holds the row offset of slice 0 for each profile; shifting
  # by slice index i yields the rows for slice i across all profiles
  .maxSlice <- max(sliceSequence)
  .sliceStarts <- which(seq_len(nrow(.sall)) %% .maxSlice == 0) - .maxSlice
  .v <- vapply(sliceSequence, FUN.VALUE = numeric(length(s)), function(i) .sliceStarts + i)

  # create dissimilarity matrix for each slice, scaled by depth weight w
  .ds <- data.table(.sliceID = sliceSequence)
  .d <- .ds[, list(list(
    .NCSP_distanceCalc(
      m = .sall[.v[, .sliceID], ],
      sm = soil.matrix[.sliceID, ],
      w = weights,
      isColor = isColor
    ) * w[.sliceID])), by = ".sliceID"]$V1

  ## optionally return list of per-depth distance matrices
  if (returnDepthDistances) {
    return(.d)
  }

  # print total size of D
  message(paste(" [", signif(object.size(.d) / 1024^2, 1), " Mb]", sep = ''))

  ## flatten list of distance matrices by summing over slices
  .d <- Reduce('+', .d)


  ## optionally normalize by dividing by max(D)
  # this is important when incorporating site data
  # TODO: causes problems for some functions like MASS::sammon() ?
  if (rescaleResult) {
    .d <- .d / max(.d, na.rm = TRUE)
  }


  ## metadata

  # distance metric
  if (isColor) {
    attr(.d, 'Distance Metric') <- 'CIE2000'
  } else {
    attr(.d, 'Distance Metric') <- 'Gower'
  }

  # removed profiles, if any
  attr(.d, 'removed.profiles') <- .removed.profiles

  # remove warnings about NA from cluster::daisy()
  attr(.d, 'NA.message') <- NULL

  # full set of profiles replaced with profile IDs stored in attr(.d, 'Labels')
  attr(.d, 'Labels') <- .ids

  # done
  return(.d)

}


132 changes: 132 additions & 0 deletions misc/NCSP/optimize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# NCSP() optimization
# Reprex comparing NCSP() against the draft .NCSP2(); lines starting with
# `#>` are recorded console output from a prior run.

# converted for-loop into data.table call for slice-wise .NCSP_distanceCalc

# costly operations in .NCSP_distanceCalc
# - daisy() for each slice is the primary cost (~1/3 execution time)
# - outer() twice for each slice adds up
# - conversion from dist->matrix->dist is somewhat costly (requires 2+ copy; adds up)

# opportunities
# - can daisy/outer/outer/matrix conversion etc. be called only once--for all slices?

library(aqp)
#> This is aqp 2.0

# setup horizon-level data: data are from lab sampled pedons
d <- read.csv(
textConnection('series,top,bottom,clay,frags,ph
auburn,0,3,21,6,5.6
auburn,3,15,21,13,5.6
auburn,15,25,20,9,5.8
auburn,25,47,21,28,5.8
dunstone,0,5,16,13,6
dunstone,5,17,17,19,6.3
dunstone,17,31,20,6,6.3
dunstone,31,41,21,15,6.3
sobrante,0,5,18,0,5.8
sobrante,5,10,16,2,5.7
sobrante,10,28,15,21,5.8
sobrante,28,51,18,13,6.2
sobrante,51,74,20,12,6.2'))

# establish site-level data
s <- data.frame(
series = c('auburn', 'dunstone', 'sobrante'),
precip = c(24, 30, 32)
)

# generate fake horizon names with clay / frags / ph
d$name <- with(d, paste(clay, frags, ph, sep='/'))

# upgrade to SoilProfile Collection object
depths(d) <- series ~ top + bottom
site(d) <- s
hzdesgnname(d) <- 'name'


# Benchmark 1: tiny example (3 profiles, 3 hz-level vars) ----
# memory = FALSE because bench::mark cannot profile allocations reliably
# here (see the GC warning in the recorded output)
bench::mark(
memory = FALSE,
before = NCSP(
d,
vars = c('clay', 'ph', 'frags'),
k = 0,
maxDepth = 61
),
after = aqp:::.NCSP2(
d,
vars = c('clay', 'ph', 'frags'),
k = 0,
maxDepth = 61
)
)
#> Computing dissimilarity matrices from 3 profiles
#> [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Computing dissimilarity matrices from 3 profiles [0.09 Mb]
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>  expression min median `itr/sec` mem_alloc `gc/sec`
#>  <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 before 517ms 517ms 1.93 NA 3.87
#> 2 after 114ms 131ms 7.78 NA 3.89

# Benchmark 2: medium case (100 random profiles, 5 vars) ----
# recorded result: ~2.5x speedup
set.seed(123)
x <- aqp::combine(lapply(1:100, random_profile, SPC = TRUE))
bench::mark(
memory = FALSE,
before = NCSP(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 143
),
after = aqp:::.NCSP2(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 143
)
)
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Computing dissimilarity matrices from 100 profiles [6 Mb]
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>  expression min median `itr/sec` mem_alloc `gc/sec`
#>  <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 before 2.52s 2.52s 0.397 NA 18.3
#> 2 after 1s 1s 0.999 NA 14.0

# Benchmark 3: large case (1000 random profiles, 5 vars) ----
# recorded result: smaller relative gain (~1.3x); the per-slice distance
# list grows large here (700 Mb) and GC dominates
set.seed(123)
x <- aqp::combine(lapply(1:1000, random_profile, SPC = TRUE))
bench::mark(
memory = FALSE,
before = NCSP(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 177
),
after = aqp:::.NCSP2(
x,
vars = c('p1', 'p2', 'p3', 'p4', 'p5'),
k = 0,
maxDepth = 177
)
)
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Computing dissimilarity matrices from 1000 profiles [700 Mb]
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>  expression min median `itr/sec` mem_alloc `gc/sec`
#>  <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 before 52.5s 52.5s 0.0191 NA 3.03
#> 2 after 40.9s 40.9s 0.0244 NA 3.86