Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
109 changes: 65 additions & 44 deletions R/parseOSD_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# l[['sections']] <- .extractSections(res)
# l[['section-indices']] <- .findSectionIndices(res)

message(filename)
l <- list()
l[['site-data']] <- .extractSiteData(x, logfile, filename)
tp <- strsplit(as.character(x$`TYPICAL PEDON`$content), "\n")
Expand Down Expand Up @@ -303,28 +304,27 @@

#' @importFrom stringi stri_match_all
.extractHzData <- function(tp, logfile = "OSD.log", filename = "FOO.txt") {

# detect horizons with both top and bottom depths
hz.rule <- "([\\^\\'\\/a-zA-Z0-9]+(?: and [\\^\\'\\/a-zA-Z0-9]+)?)\\s*[-=\u2014]+\\s*([Ol0-9.]+)\\s*?(to|-)?\\s+?([Ol0-9.]+)\\s*?(in|inches|cm|centimeters)"
hz.rule <- "([\\^\\'\\\"`\\/a-zA-Z0-9]+(?: and [\\^\\'\\\"`\\/a-zA-Z0-9]+)?)\\s*[-=\u2014]+\\s*([Ol0-9.]+)\\s*?([toTO0\\-]+)?\\s+?([Ol0-9.]+)\\s*?(inche?s?|in|cm|centimeters?)"

# detect horizons with no bottom depth
hz.rule.no.bottom <- "([\\^\\'\\/a-zA-Z0-9]+(?: and [\\^\\'\\/a-zA-Z0-9]+)?)\\s*[-=\u2014]+?\\s*([Ol0-9.]+)\\s*(to|-)?\\s*([Ol0-9.]+)?\\s*?(in|inches|cm|centimeters)"
hz.rule.no.bottom <- "([\\^\\'\\\"`\\/a-zA-Z0-9]+(?: and [\\^\\'\\\"`\\/a-zA-Z0-9]+)?)\\s*[-=\u2014]+?\\s*([Ol0-9./ ]+)\\s*(inche?s?|in|cm|centimeters?)?\\s*([toTO0\\-]+)?\\s*([Ol0-9./ ]+)?\\s*(inche?s?|in|cm|centimeters?)?"

## default encoding of colors: Toggle dry/moist assumption
##
## Profile-level statement: Colors are for dry soil unless otherwise stated | Colors are for moist soil unless otherwise stated
##
##
## Examples:
## moist:
## E1--7 to 12 inches; very dark gray (10YR 3/1) silt loam, 50 percent gray (10YR 5/1) and 50 percent gray (10YR 6/1) dry; moderate thin platy structure parting to weak thin platy; friable, soft; common fine and medium roots throughout; common fine tubular pores; few fine distinct dark yellowish brown (10YR 4/6) friable masses of iron accumulations with sharp boundaries on faces of peds; strongly acid; clear wavy boundary.
##
##
## dry:
## A--0 to 6 inches; light gray (10YR 7/2) loam, dark grayish brown (10YR 4/2) moist; moderate coarse subangular blocky structure; slightly hard, friable, slightly sticky and slightly plastic; many very fine roots; many very fine and few fine tubular and many very fine interstitial pores; 10 percent pebbles; strongly acid (pH 5.1); clear wavy boundary. (1 to 8 inches thick)
##
dry.is.default <- length(grep('for[ athe]+(?:air-* *)?dr[yied]+[ \\n,]+(colors|soil|conditions)', tp, ignore.case = TRUE)) > 0
moist.is.default <- length(grep('for[ athe]+(wet|moi*st)[ \\n,]+(rubbed|crushed|broken|interior|soil|conditions)', tp, ignore.case = TRUE)) > 0


if (dry.is.default)
default.moisture.state <- 'dry'
if (moist.is.default)
Expand All @@ -347,7 +347,7 @@

# eliminate empty lines within typical pedon
tp <- tp[nzchar(trimws(tp))]

# ID starting lines of horizon information
hz.idx <- sort(unique(c(grep(hz.rule, tp), grep(hz.rule.no.bottom, tp))))

Expand All @@ -356,7 +356,7 @@
if (length(first.line.flag) > 0) {
hz.idx <- hz.idx[-first.line.flag]
}

check.multiline <- diff(hz.idx) > 1
if (any(check.multiline)) {
# multiline typical pedon horizon formatting (needs fix)
Expand All @@ -380,45 +380,64 @@
# if none, then try searching for only top depths
if (all(is.na(h))) {
# this won't have the correct number of elements, adjust manually
h <- stringi::stri_match(this.chunk, regex = hz.rule.no.bottom)
h_num <- grep("^\\d+$", h)
h_alp <- grep("[A-Za-z]", h)[2:3]
h <- h[sort(c(h_num, h_alp))]

h <- trimws(stringi::stri_match(this.chunk, regex = hz.rule.no.bottom))
h[2] <- gsub("0", "O", h[2], fixed=TRUE)
h[6] <- gsub("l", "1", h[6], fixed=TRUE)
h <- gsub(" *3/4", ".75", h)
h <- gsub(" *[1l]/2", ".5", h)
h <- gsub(" *[1l]/[48]", ".25", h) # NB: fudging 1/8 inch -> 1 cm
h <- gsub("^\\.", "0.", h)
h <- gsub("l", "1", h)
i_num <- grep("^\\d+\\.*\\d*$", h)
# fill missing depth with NA
if (length(h) == 3) {
h <- c(h, h[3])
h[3] <- NA
if (length(i_num) == 1) {
i_num <- c(i_num, NA)
}
h_num <- h[i_num]
l_alp <- grepl("[A-Za-z]", h)
h_alp <- h[l_alp & h != "to" & h != "-"][2:3]
h <- c(h_alp[1], h_num, h_alp[2])

} else {
h[2] <- gsub("0", "O", h[2], fixed=TRUE)
h[c(3,5)] <- gsub("l", "1", h[c(3,5)], fixed=TRUE)
h <- h[c(2:3,5:6)]
}

# save hz data to list
hz.data[[i]] <- h

# save narrative to list
narrative.data[[i]] <- this.chunk

## TODO: test this!
# parse ALL colors, result is a multi-row matrix, 5th column is moisture state
colors <- stringi::stri_match_all(this.chunk, regex = color.rule)[[1]]
# apply a filter so horizon data with no horizon designation skip
if (!is.na(h[1]) && grepl("[OABCDELMRVWbcxw]", h[1])) {
# save hz data to list
hz.data[[i]] <- h

# save narrative to list
narrative.data[[i]] <- this.chunk

## TODO: test this!
# parse ALL colors, result is a multi-row matrix, 5th column is moisture state
colors <- stringi::stri_match_all(this.chunk, regex = color.rule)[[1]]

# replace missing moisture state with (parsed) default value
colors[, 5][which(colors[, 5] == '')] <- default.moisture.state

# extract dry|moist colors, note that there may be >1 color per state
dc <- colors[which(colors[, 5] == 'dry'), 1:4, drop = FALSE]
mc <- colors[which(colors[, 5] == 'moist'), 1:4, drop = FALSE]

# there there was at least 1 match, keep the first 1
if (nrow(dc) > 0) {
dry.colors[[i]] <- dc[1, ]
} else dry.colors[[i]] <- matrix(rep(NA, times = 4), nrow = 1)

if (nrow(mc) > 0)
moist.colors[[i]] <- mc[1, ]
else moist.colors[[i]] <- matrix(rep(NA, times = 4), nrow = 1)
} else {
hz.data[[i]] <- NULL
narrative.data[[i]] <- NULL
dry.colors[[i]] <- NULL
moist.colors[[i]] <- NULL
}

# replace missing moisture state with (parsed) default value
colors[, 5][which(colors[, 5] == '')] <- default.moisture.state

# extract dry|moist colors, note that there may be >1 color per state
dc <- colors[which(colors[, 5] == 'dry'), 1:4, drop = FALSE]
mc <- colors[which(colors[, 5] == 'moist'), 1:4, drop = FALSE]

# there there was at least 1 match, keep the first 1
if (nrow(dc) > 0) {
dry.colors[[i]] <- dc[1, ]
} else dry.colors[[i]] <- matrix(rep(NA, times = 4), nrow = 1)

if (nrow(mc) > 0)
moist.colors[[i]] <- mc[1, ]
else moist.colors[[i]] <- matrix(rep(NA, times = 4), nrow = 1)
}

# test for no parsed data, must be some funky formatting...
Expand All @@ -427,6 +446,8 @@

# convert to DF
hz.data <- as.data.frame(do.call('rbind', hz.data))
if (ncol(hz.data) != 4)
return(NULL)
dry.colors <- as.data.frame(do.call('rbind', dry.colors))[2:4]
moist.colors <- as.data.frame(do.call('rbind', moist.colors))[2:4]
narrative.data <- as.data.frame(do.call('rbind', narrative.data))
Expand All @@ -448,9 +469,9 @@
moist.colors$moist_chroma <- as.numeric(moist.colors$moist_chroma)
})

## TODO: sanity check / unit reporting: this will fail when formatting is inconsistent (PROPER series)
# convert in -> cm using the first horizon
if (hz.data$units[1] %in% c('inches', 'in')) {
if (!is.na(hz.data$units[1]) &&
startsWith(tolower(hz.data$units[1]), "in")) {
hz.data$top <- round(hz.data$top * 2.54)
hz.data$bottom <- round(hz.data$bottom * 2.54)
}
Expand Down
41 changes: 3 additions & 38 deletions inst/extdata/OSD-error-reporting/RO/auburn-hz.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"ACKWATER","Bt2",23,41,NA,NA,NA,"lOYR","5","8"
"ALAMUCHEE","Bwl",13,33,NA,NA,NA,"10YR","4","4"
"ALDERFLATS","Btgl",13,30,NA,NA,NA,"5Y","5","2"
"ANDRY","022",15,0,NA,NA,NA,"10YR","3","2"
"ANDRY","021",30,15,NA,NA,NA,"10YR","3","2"
"ANDRY","O22",15,0,NA,NA,NA,"10YR","3","2"
"ANDRY","O21",30,15,NA,NA,NA,"10YR","3","2"
"ASHLAR","A",5,13,NA,NA,NA,"l0YR","3","2"
"ASHLAR","E",13,28,NA,NA,NA,"l0YR","5","2"
"ASHLAR","Bw",28,51,NA,NA,NA,"l0YR","5","4"
Expand All @@ -14,78 +14,45 @@
"BELHAVEN","Oal",23,33,NA,NA,NA,"5YR","2.5","2"
"BENNDALE","Btl",13,28,NA,NA,NA,"10YR","5","6"
"BLICHTON","Btvgl",76,114,NA,NA,NA,"N","4","0"
"BODINE","0e",0,3,NA,NA,NA,"10YR","2","2"
"BOURNE","0i",0,3,NA,NA,NA,NA,NA,NA
"BRIGHTON","Oap",0,30,NA,NA,NA,"l0YR","2","1"
"BROCKROAD","01",0,5,NA,NA,NA,NA,NA,NA
"BURROWSVILLE","A",0,8,NA,NA,NA,"lOYR","5","2"
"BURROWSVILLE","E",8,36,NA,NA,NA,"lOYR","6","4"
"BURROWSVILLE","Bt",36,64,NA,NA,NA,"lOYR","6","4"
"BURROWSVILLE","Btx",64,97,NA,NA,NA,"lOYR","5","6"
"CAROLINE","0i",0,5,NA,NA,NA,NA,NA,NA
"CHEAHA","0i",0,3,NA,NA,NA,NA,NA,NA
"CHRISTIAN","Btl",15,38,NA,NA,NA,"5YR","5","8"
"COLFAX","0i",3,3,NA,NA,NA,NA,NA,NA
"DEMORY","0a",0,8,NA,NA,NA,"10YR","2","1"
"COLFAX","Oi",3,0,NA,NA,NA,NA,NA,NA
"EATON","Btgl",76,84,NA,NA,NA,"10YR","5","1"
"ELLOREE","Btgl",58,69,NA,NA,NA,"10YR","5","2"
"ELLOREE","BCgl",107,135,NA,NA,NA,"10YR","6","1"
"EUREKA","Btgl",28,51,NA,NA,NA,"N","5","0"
"FORK","0i",0,3,NA,NA,NA,NA,NA,NA
"FORK","0e",3,5,NA,NA,NA,"10YR","3","2"
"FORTUNA","B2lg",13,23,NA,NA,NA,"5GY","4","1"
"FREELAND","Btl",33,51,NA,NA,NA,"10YR","4","4"
"GRIFFITH","Al",25,56,NA,NA,NA,"5Y","3","1"
"GRITNEY","Btl",23,43,NA,NA,NA,"10YR","5","6"
"KEYESPOINT","Bgl",18,46,NA,NA,NA,"10YR","4","2"
"LAROQUE","0i",0,3,NA,NA,NA,NA,NA,NA
"LEDWITH","0a",0,23,NA,NA,NA,"10YR","3","3"
"MATTAPONI","0i",0,3,NA,NA,NA,NA,NA,NA
"MCLAURIN","Btl",36,51,NA,NA,NA,"5YR","5","6"
"MONTROSS","0i",0,5,NA,NA,NA,NA,NA,NA
"NATCHEZ","0i",0,3,NA,NA,NA,NA,NA,NA
"NEWBERN","Ap",0,13,NA,NA,NA,"1OYR","5","4"
"NEWBERN","Bw",13,33,NA,NA,NA,"1OYR","6","6"
"NEWBERN","Cr",33,46,NA,NA,NA,"1OYR","6","8"
"NEWFLAT","0i",0,3,NA,NA,NA,NA,NA,NA
"NEWHAN","A",0,5,NA,NA,NA,"l0YR","5","2"
"NEWHAN","C1",5,127,NA,NA,NA,"l0YR","7","2"
"NEWHAN","C2",127,183,NA,NA,NA,"l0YR","7","2"
"OAKLIMETER","Bwl",28,51,NA,NA,NA,"10YR","5","4"
"OKEECHOBEE","0a",20,71,NA,NA,NA,"5YR","2","1"
"OPENLAKE","Bgl",18,33,NA,NA,NA,"10YR","4","2"
"ORENDA","0i",0,3,NA,NA,NA,NA,NA,NA
"PAILO","A",3,8,NA,NA,NA,"l0YR","3","2"
"PAILO","BE",8,43,NA,NA,NA,"l0YR","5","6"
"PAILO","Btl",43,89,NA,NA,NA,"l0YR","5","6"
"PAMUNKEY","2C",117,203,NA,NA,NA,"1OYR","5","6"
"PARTLOW","Btgl",38,53,NA,NA,NA,"10YR","5","1"
"PETAL","Btl",20,43,NA,NA,NA,"5YR","5","8"
"POCATY","0i",0,30,NA,NA,NA,"10YR","2","2"
"POCATY","0e",30,51,NA,NA,NA,"10YR","2","2"
"POCATY","0a1",51,104,NA,NA,NA,"10YR","2","1"
"POCATY","0a2",104,122,NA,NA,NA,"10YR","2","1"
"POCATY","0a3",122,152,NA,NA,NA,"10YR","4","1"
"PUNTA","A",0,10,NA,NA,NA,"l0YR","4",NA
"PUNTA","E1",10,28,NA,NA,NA,"l0YR","6","2"
"PUNTA","E2",28,145,NA,NA,NA,"l0YR","8",NA
"RAPPAHANNOCK","0a1",0,41,NA,NA,NA,"10YR","3","2"
"RAPPAHANNOCK","0a2",41,76,NA,NA,NA,"10YR","3","1"
"RAPPAHANNOCK","0a3",76,104,NA,NA,NA,"10YR","2","2"
"RAPPAHANNOCK","0'a",160,190,NA,NA,NA,"10YR","2","1"
"RED HILLS","Oi",3,0,NA,NA,NA,NA,NA,NA
"REMLIK","0i",0,5,NA,NA,NA,NA,NA,NA
"REPARADA","110C",46,15,NA,NA,NA,"10YR","2","1"
"ROME","Btl",23,51,NA,NA,NA,"7.5YR","5","6"
"SEKIL","0i",0,3,NA,NA,NA,NA,NA,NA
"SEKIL","0e",3,5,NA,NA,NA,"10YR","2","1"
"SOCO","0i",0,5,NA,NA,NA,NA,NA,NA
"STECOAH","0i",0,3,NA,NA,NA,NA,NA,NA
"SURRENCY","0i",0,3,NA,NA,NA,NA,NA,NA
"SWAFFORD","Btl",20,30,NA,NA,NA,"7.5YR","4","6"
"TALANTE","Alg",10,25,NA,NA,NA,"10YR","5","2"
"TALANTE","Blg",25,46,NA,NA,NA,"10YR","5","3"
"TALLADEGA","0i",0,5,NA,NA,NA,NA,NA,NA
"TRENHOLM","Btl",30,51,NA,NA,NA,"10YR","5","6"
"TRENHOLM","Cl",91,114,NA,NA,NA,"10YR","5","8"
"TRENHOLM","A",NA,5,NA,NA,NA,"10YR","3","1"
Expand All @@ -96,6 +63,4 @@
"VICKSBURG","Cl",18,71,NA,NA,NA,"10YR","4","3"
"WANDO","Cl",20,89,NA,NA,NA,"7.5YR","5","8"
"WEEKSVILLE","Cgl",114,152,NA,NA,NA,"10YR","6","1"
"WESTWEGO","II02bg",53,91,NA,NA,NA,"N","2","0"
"YULEE","0i",0,5,NA,NA,NA,NA,NA,NA
"ZUBER","Btl",38,51,NA,NA,NA,"10YR","4","4"
1 change: 0 additions & 1 deletion inst/extdata/OSD-error-reporting/RO/auburn-sections.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"ANDRY","MULTILINE TYPICAL PEDON","[number of multilines=6]","auburn, al"
"ANNISTON","MULTILINE TYPICAL PEDON","[number of multilines=4]","auburn, al"
"BASSFIELD","MULTILINE TYPICAL PEDON","[number of multilines=5]","auburn, al"
"BEARKNOB","MULTILINE TYPICAL PEDON","[number of multilines=1]","auburn, al"
"BOWMANTOWN","MULTILINE TYPICAL PEDON","[number of multilines=1]","auburn, al"
"BREWTON","MULTILINE TYPICAL PEDON","[number of multilines=1]","auburn, al"
"BUGLEY","DUPLICATION OF HEADERS","[RANGE IN CHARACTERISTICS:]","auburn, al"
Expand Down
34 changes: 4 additions & 30 deletions inst/extdata/OSD-error-reporting/RO/auburn-series.csv
Original file line number Diff line number Diff line change
@@ -1,71 +1,45 @@
"id","ac","benchmarksoilflag","soiltaxclasslastupdated","depthErrors","ocrErrors"
"BODINE",1152290,TRUE,2009,FALSE,TRUE
"SURRENCY",773555,FALSE,2021,FALSE,TRUE
"MCLAURIN",564282,FALSE,1997,FALSE,TRUE
"BENNDALE",408250,FALSE,2012,FALSE,TRUE
"OAKLIMETER",364788,FALSE,2005,FALSE,TRUE
"CHRISTIAN",297423,FALSE,2001,FALSE,TRUE
"NATCHEZ",227417,FALSE,1998,FALSE,TRUE
"BELHAVEN",213210,FALSE,2002,FALSE,TRUE
"SOCO",189881,FALSE,2001,FALSE,TRUE
"TALLADEGA",155342,FALSE,2001,FALSE,TRUE
"GRITNEY",129163,FALSE,1997,FALSE,TRUE
"ASHLAR",107187,TRUE,2017,FALSE,TRUE
"VICKSBURG",103571,FALSE,2002,FALSE,TRUE
"REMLIK",103503,FALSE,2002,FALSE,TRUE
"STECOAH",95927,FALSE,2001,FALSE,TRUE
"MATTAPONI",92795,FALSE,2003,FALSE,TRUE
"CAROLINE",77934,FALSE,2002,FALSE,TRUE
"PAILO",77493,FALSE,2002,FALSE,TRUE
"PETAL",74698,FALSE,2003,FALSE,TRUE
"NEWBERN",73692,FALSE,1998,FALSE,TRUE
"BLICHTON",71250,FALSE,2002,FALSE,TRUE
"COLFAX",58035,FALSE,1997,TRUE,TRUE
"CHEAHA",53744,FALSE,1999,FALSE,TRUE
"COLFAX",58035,FALSE,1997,TRUE,FALSE
"NEWHAN",51040,FALSE,2003,FALSE,TRUE
"BOURNE",45410,FALSE,2002,FALSE,TRUE
"FREELAND",44420,FALSE,2002,FALSE,TRUE
"FREELAND",44420,FALSE,2002,TRUE,TRUE
"GRIFFITH",39628,FALSE,2004,FALSE,TRUE
"OPENLAKE",31864,FALSE,2004,FALSE,TRUE
"DEMORY",29488,FALSE,2000,FALSE,TRUE
"EUREKA",29370,FALSE,2002,FALSE,TRUE
"OKEECHOBEE",29101,FALSE,2000,FALSE,TRUE
"ELLOREE",28778,FALSE,1997,FALSE,TRUE
"PARTLOW",26416,FALSE,1999,FALSE,TRUE
"KEYESPOINT",24448,FALSE,2002,FALSE,TRUE
"WESTWEGO",23971,FALSE,2011,FALSE,TRUE
"PAMUNKEY",23728,FALSE,2002,FALSE,TRUE
"BEECH GROVE",23364,FALSE,2002,TRUE,FALSE
"WANDO",21363,FALSE,2002,FALSE,TRUE
"ROME",20963,FALSE,2003,FALSE,TRUE
"ACKWATER",20910,FALSE,1997,FALSE,TRUE
"MONTROSS",19220,FALSE,2007,FALSE,TRUE
"EATON",19201,FALSE,2002,FALSE,TRUE
"BRIGHTON",17281,FALSE,2000,FALSE,TRUE
"RAPPAHANNOCK",16907,FALSE,2003,FALSE,TRUE
"WEEKSVILLE",16893,FALSE,2006,FALSE,TRUE
"ORENDA",14755,FALSE,2008,FALSE,TRUE
"SWAFFORD",12895,FALSE,2001,FALSE,TRUE
"NEWFLAT",12053,FALSE,1997,FALSE,TRUE
"YULEE",8230,FALSE,1998,FALSE,TRUE
"FORK",8078,FALSE,1997,FALSE,TRUE
"ANDRY",7778,FALSE,2003,TRUE,TRUE
"ANDRY",7778,FALSE,2003,TRUE,FALSE
"ZUBER",7641,FALSE,2003,FALSE,TRUE
"POCATY",7379,FALSE,2003,FALSE,TRUE
"SEKIL",6120,FALSE,2002,FALSE,TRUE
"LEDWITH",5574,FALSE,2003,FALSE,TRUE
"TALANTE",5463,FALSE,2002,FALSE,TRUE
"BROCKROAD",5373,FALSE,1999,FALSE,TRUE
"RED HILLS",4713,FALSE,2006,TRUE,FALSE
"TRENHOLM",4019,FALSE,2002,TRUE,TRUE
"PUNTA",3805,FALSE,1993,FALSE,TRUE
"TUCKAHOE",3244,FALSE,1999,FALSE,TRUE
"BURROWSVILLE",3176,FALSE,1997,FALSE,TRUE
"FORTUNA",3050,FALSE,2002,FALSE,TRUE
"LAROQUE",2387,FALSE,1999,FALSE,TRUE
"ALAMUCHEE",744,FALSE,2000,FALSE,TRUE
"BAHIAHONDA",672,FALSE,2009,FALSE,TRUE
"MAYO",649,FALSE,2001,TRUE,FALSE
"ALDERFLATS",529,FALSE,2006,FALSE,TRUE
"REPARADA",351,FALSE,2002,TRUE,TRUE
"BEARKNOB",189,FALSE,2013,TRUE,FALSE
"VERO",NA,FALSE,2004,FALSE,TRUE
Loading