|
303 | 303 |
|
304 | 304 | #' @importFrom stringi stri_match_all
|
305 | 305 | .extractHzData <- function(tp, logfile = "OSD.log", filename = "FOO.txt") {
|
306 |
| - |
| 306 | + |
307 | 307 | # detect horizons with both top and bottom depths
|
308 |
| - hz.rule <- "([\\^\\'\\/a-zA-Z0-9]+(?: and [\\^\\'\\/a-zA-Z0-9]+)?)\\s*[-=\u2014]+\\s*([Ol0-9.]+)\\s*?(to|-)?\\s+?([Ol0-9.]+)\\s*?(in|inches|cm|centimeters)" |
309 |
| - |
| 308 | + hz.rule <- "([\\^\\'\\/a-zA-Z0-9]+(?: and [\\^\\'\\/a-zA-Z0-9]+)?)\\s*[-=\u2014]+\\s*([Ol0-9.]+)\\s*?(to|-)?\\s+?([Ol0-9.]+)\\s*?(inches|in|cm|centimeters)" |
| 309 | + |
310 | 310 | # detect horizons with no bottom depth
|
311 |
| - hz.rule.no.bottom <- "([\\^\\'\\/a-zA-Z0-9]+(?: and [\\^\\'\\/a-zA-Z0-9]+)?)\\s*[-=\u2014]+?\\s*([Ol0-9.]+)\\s*(to|-)?\\s*([Ol0-9.]+)?\\s*?(in|inches|cm|centimeters)" |
312 |
| - |
| 311 | + hz.rule.no.bottom <- "([\\^\\'\\/a-zA-Z0-9]+(?: and [\\^\\'\\/a-zA-Z0-9]+)?)\\s*[-=—]+?\\s*([Ol0-9./ ]+)\\s*(inche?s?|in|cm|centimeters)?\\s*(to|-)?\\s*([Ol0-9./ ]+)?\\s*(inche?s?|in|cm|centimeters)?" |
| 312 | + |
313 | 313 | ## default encoding of colors: Toggle dry/moist assumption
|
314 | 314 | ##
|
315 | 315 | ## Profile-level statement: Colors are for dry soil unless otherwise stated | Colors are for moist soil unless otherwise stated
|
316 |
| - ## |
| 316 | + ## |
317 | 317 | ## Examples:
|
318 | 318 | ## moist:
|
319 | 319 | ## E1--7 to 12 inches; very dark gray (10YR 3/1) silt loam, 50 percent gray (10YR 5/1) and 50 percent gray (10YR 6/1) dry; moderate thin platy structure parting to weak thin platy; friable, soft; common fine and medium roots throughout; common fine tubular pores; few fine distinct dark yellowish brown (10YR 4/6) friable masses of iron accumulations with sharp boundaries on faces of peds; strongly acid; clear wavy boundary.
|
320 |
| - ## |
| 320 | + ## |
321 | 321 | ## dry:
|
322 | 322 | ## A--0 to 6 inches; light gray (10YR 7/2) loam, dark grayish brown (10YR 4/2) moist; moderate coarse subangular blocky structure; slightly hard, friable, slightly sticky and slightly plastic; many very fine roots; many very fine and few fine tubular and many very fine interstitial pores; 10 percent pebbles; strongly acid (pH 5.1); clear wavy boundary. (1 to 8 inches thick)
|
323 | 323 | ##
|
324 | 324 | dry.is.default <- length(grep('for[ athe]+(?:air-* *)?dr[yied]+[ \\n,]+(colors|soil|conditions)', tp, ignore.case = TRUE)) > 0
|
325 | 325 | moist.is.default <- length(grep('for[ athe]+(wet|moi*st)[ \\n,]+(rubbed|crushed|broken|interior|soil|conditions)', tp, ignore.case = TRUE)) > 0
|
326 | 326 |
|
327 |
| - |
328 | 327 | if (dry.is.default)
|
329 | 328 | default.moisture.state <- 'dry'
|
330 | 329 | if (moist.is.default)
|
|
347 | 346 |
|
348 | 347 | # eliminate empty lines within typical pedon
|
349 | 348 | tp <- tp[nzchar(trimws(tp))]
|
350 |
| - |
| 349 | + |
351 | 350 | # ID starting lines of horizon information
|
352 | 351 | hz.idx <- sort(unique(c(grep(hz.rule, tp), grep(hz.rule.no.bottom, tp))))
|
353 | 352 |
|
|
356 | 355 | if (length(first.line.flag) > 0) {
|
357 | 356 | hz.idx <- hz.idx[-first.line.flag]
|
358 | 357 | }
|
359 |
| - |
| 358 | + |
360 | 359 | check.multiline <- diff(hz.idx) > 1
|
361 | 360 | if (any(check.multiline)) {
|
362 | 361 | # multiline typical pedon horizon formatting (needs fix)
|
|
380 | 379 | # if none, then try searching for only top depths
|
381 | 380 | if (all(is.na(h))) {
|
382 | 381 | # this won't have the correct number of elements, adjust manually
|
383 |
| - h <- stringi::stri_match(this.chunk, regex = hz.rule.no.bottom) |
384 |
| - h_num <- grep("^\\d+$", h) |
385 |
| - h_alp <- grep("[A-Za-z]", h)[2:3] |
386 |
| - h <- h[sort(c(h_num, h_alp))] |
387 |
| - |
| 382 | + h <- trimws(stringi::stri_match(this.chunk, regex = hz.rule.no.bottom)) |
| 383 | + h[2] <- gsub("0", "O", h[2], fixed=TRUE) |
| 384 | + h[6] <- gsub("l", "1", h[6], fixed=TRUE) |
| 385 | + h <- gsub(" *[1l]/2", ".5", h) |
| 386 | + h <- gsub(" *[1l]/[48]", ".25", h) # NB: fudging 1/8 inch -> 1 cm |
| 387 | + h <- gsub("^\\.", "0.", h) |
| 388 | + h_num <- grep("^\\d+\\.*\\d*$", h) |
388 | 389 | # fill missing depth with NA
|
389 |
| - if (length(h) == 3) { |
390 |
| - h <- c(h, h[3]) |
391 |
| - h[3] <- NA |
| 390 | + if (length(h_num) == 1) { |
| 391 | + h_num <- c(h_num, NA) |
392 | 392 | }
|
| 393 | + h_alp <- grep("[A-Za-z]", h)[2:3] |
| 394 | + h <- h[c(h_alp[1], h_num, h_alp[2])] |
| 395 | + |
393 | 396 | } else {
|
| 397 | + h[2] <- gsub("0", "O", h[2], fixed=TRUE) |
| 398 | + h[c(3,5)] <- gsub("l", "1", h[c(3,5)], fixed=TRUE) |
394 | 399 | h <- h[c(2:3,5:6)]
|
395 | 400 | }
|
396 | 401 |
|
|
403 | 408 | ## TODO: test this!
|
404 | 409 | # parse ALL colors, result is a multi-row matrix, 5th column is moisture state
|
405 | 410 | colors <- stringi::stri_match_all(this.chunk, regex = color.rule)[[1]]
|
406 |
| - |
| 411 | + |
407 | 412 | # replace missing moisture state with (parsed) default value
|
408 | 413 | colors[, 5][which(colors[, 5] == '')] <- default.moisture.state
|
409 | 414 |
|
|
448 | 453 | moist.colors$moist_chroma <- as.numeric(moist.colors$moist_chroma)
|
449 | 454 | })
|
450 | 455 |
|
451 |
| - ## TODO: sanity check / unit reporting: this will fail when formatting is inconsistent (PROPER series) |
452 | 456 | # convert in -> cm using the first horizon
|
453 |
| - if (hz.data$units[1] %in% c('inches', 'in')) { |
| 457 | + if (hz.data$units[1] %in% c('inches', 'in', 'inch')) { |
454 | 458 | hz.data$top <- round(hz.data$top * 2.54)
|
455 | 459 | hz.data$bottom <- round(hz.data$bottom * 2.54)
|
456 | 460 | }
|
|
0 commit comments