Substring first n elements
df <- data.frame(a = NA, b = c(1:3, NA), c = 0)
drop_nas(df)
#> b c
#> 1 1 0
#> 2 2 0
#> 3 3 0
#> 4 NA 0
drop_zeros(df)
#> a b
#> 1 NA 1
#> 2 NA 2
#> 3 NA 3
#> 4 NA NA
clean directory/file names
Clean file names from a directory using
clean_dir_name()
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
clean_dir_name(f)
#> [1] "uon-trial-1"
List files
List files in a directory using list_files()
list_files(system.file(package = "pbwrangler"))[1:5]
#> [1] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/meta-data.xlsx"
#> [2] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/Pedigree_advanced_clones_SSA.xlsx"
#> [3] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/uon-trial-1.csv"
#> [4] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/uon-trial-1.xlsx"
#> [5] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/year-cross.csv"
Compare dataframes to find shared columns
If you suspect two fieldbooks to be similar, you can compare them.
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir=NULL, file_to_read = f)[[1]]
f1 <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
df1 <- read_workbooks(dir=NULL, file_to_read = f, sheet_name = "Sheet1")[[1]]
compare_df(df, df1)
#> dim_X dim_Y design_X design_Y cols_not_X cols_not_Y
#> 1 108,32 108,32 6*18 6*18
Merge notes/obs to one column
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir=NULL, file_to_read = f)[[1]]
merge_note_obs(df)[1:5, "obs"]
#> [1] "long oblong,,good yield" "very good yield"
#> [3] "buds, pinkish eyes" "good yield"
#> [5] NA
Create a meta-data file for each experiment designed during a season
Create a meta-data to accompany fieldbooks.
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f)
create_meta_file(df, season = "season-2024", d_dir = tempdir())
#> $`season-2024`
#> # A tibble: 1 × 17
#> trial_name breeding_program location year transplanting_date design_type
#> <lgl> <lgl> <chr> <chr> <lgl> <lgl>
#> 1 NA NA NA 2024 NA NA
#> # ℹ 11 more variables: description <lgl>, trial_type <lgl>, plot_width <lgl>,
#> # plot_length <lgl>, planting_date <date>, harvest_date <date>,
#> # number_of_plants_per_ridge <lgl>, number_of_ridges_per_plot <lgl>,
#> # space_between_ridges <lgl>, space_between_plants_in_ridges <lgl>,
#> # number_of_plants_per_plot <lgl>
Capture trial location
You could try capture trial location from file names:
f <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
f1 <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f, sheet_name = "Sheet 1")
df1 <- read_workbooks(dir = NULL, file_to_read = f1)
dat <- c(df, df1)
# this has a loc variable
dat_loc <- capture_location(dat)
lapply(dat_loc, function(x) x$loc[1:5])
#> $`UON-TRIAL-1`
#> [1] "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1"
#>
#> $`UON-TRIAL-1`
#> [1] "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1"
write data
You can write trial data using write_data()
:
f <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
f1 <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f, sheet_name = "Sheet 1")
df1 <- read_workbooks(dir = NULL, file_to_read = f1)
dat <- c(df, df1)
write_data(dir = tempdir(), data_list = dat, season = "season-2024")
create genotype family codes
You can also create family codes from accession names using
create_family_vars()
.
f <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f, sheet_name = "Sheet 1") %>%
capture_location(.) %>% `[[`(1)
create_family_vars(df)[1:5, ]
#> geno loc old_family_code old_year_of_cross year_of_cross
#> 1 CIP317004.001 UON-TRIAL-1 CIP317004 2017 2021
#> 2 CIP317005.036 UON-TRIAL-1 CIP317005 2017 2021
#> 3 CIP317007.103 UON-TRIAL-1 CIP317007 2017 2021
#> 4 CIP317013.120 UON-TRIAL-1 CIP317013 2017 2021
#> 5 CIP317026.004 UON-TRIAL-1 CIP317026 2017 2021
#> new_family_code new_clone_id
#> 1 CIP321004 CIP321004.001
#> 2 CIP321005 CIP321005.036
#> 3 CIP321007 CIP321007.103
#> 4 CIP321013 CIP321013.120
#> 5 CIP321026 CIP321026.004
recode variables
Parse variables from one type to another using
recode_var()
.
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>% `[[`(1)
df_c <- recode_var(df)
run routine data checks
Run data checks using st4gi package and compute derived columns
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
`[[`(1)
df_checked <- run_checks(df)
check accession names
Check accession names using check_geno()
.
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f)
df_checked <- check_geno(df)
update accession names
Update accession names with update_geno()
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
`[[`(1)
df_checked <- update_geno(df)
rename columns
Rename variables to match ontology lables with
rename_cols()
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
`[[`(1)
df <- rename_cols(df)
process trial data
Apply a number of data cleaning functions to trial data with
pre_process_trials()
A number of computed columns are generated in the process.
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f)
df_out <- pre_process_trials(df) |> process_trials() %>%
purrr::map(., run_data_processes)
purrr::map(df_out, names_df)
#> $`UON-TRIAL-1`
#> [1] "atmw" "col" "entry"
#> [4] "flower_colour" "flowering" "flowering_date"
#> [7] "geno" "growth_speed" "late_blight_60_days"
#> [10] "mtwp" "mtyna" "n_tubers"
#> [13] "nmtp" "npe" "ntp"
#> [16] "obs" "overall_impression" "pgh"
#> [19] "photo" "plant_unif" "plant_vigor"
#> [22] "plot" "ppe" "rep"
#> [25] "row" "se" "tbfsh1"
#> [28] "tbskn1" "tub_size" "tub_unif"
#> [31] "tuber_apper" "unique"
Create a trial design object for row-col design
Create a trial design object, fit model and extract predictions
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
purrr::map(
., ~dplyr::mutate(
., year = "2024", loc = "UON", trial = "lbht"
)
)
df_out <- pre_process_trials(df) |> process_trials() %>%
purrr::map(., run_data_processes) %>% `[[`(1)
# trial design object
TD <- create_td(df_out, design = "rowcol")
# fit
fit_TD <- fit_td(TD, trait = "mtwp")
# extract predictions
pred_TD <- extract_blups(fit_TD, pred = c("BLUEs", "seBLUEs"))
pred_TD[[1]][1:5,]
#> genotype trial BLUEs_mtwp seBLUEs_mtwp
#> 1 CIP312084.731 lbht 10.377484 1.156141
#> 2 CIP3177011.028 lbht 9.831602 1.155003
#> 3 CIP3177023.046 lbht 6.642917 1.190464
#> 4 CIP3177038.001 lbht 4.890195 1.163434
#> 5 CIP3187009.044 lbht 13.939097 1.160588
get in/valid column names
Get invalid columns (labels not defined in ontology)
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
purrr::map(
., ~dplyr::mutate(
., year = "2024", loc = "UON", trial = "lbht"
)
)
df_out <- pre_process_trials(df) |> process_trials() %>%
purrr::map(., run_data_processes) %>% `[[`(1)
subset_invalid_cols(df_out) %>% colnames(.)
#> [1] "geno" "unique" "entry"
#> [4] "growth_speed" "flowering_date" "flower_colour"
#> [7] "overall_impression" "late_blight_60_days" "obs"
#> [10] "photo" "trial" "n_tubers"
read trial data files
f <- system.file("uon-trial-1.csv", package = "pbwrangler")
d <- read_workbooks(dir = NULL, file_to_read = f)
lapply(d, function(x) x[1:5, 1:5])
#> $`UON-TRIAL-1`
#> unique plot geno entry row
#> 1 1CIP317004.001 1 CIP317004.001 12 1
#> 2 2CIP317005.036 2 CIP317005.036 14 2
#> 3 3CIP317007.103 3 CIP317007.103 20 3
#> 4 4CIP317013.120 4 CIP317013.120 23 4
#> 5 5CIP317026.004 5 CIP317026.004 28 5
field trial randomization
partially replicated (prep) design
data("ilri", package = "pbwrangler")
ins_ilri <- geno_by_tubers(ilri)
lapply(ins_ilri, head, 5)
#> $geno_3
#> [1] "CIP319020.009" "CIP319027.002" "CIP319027.003" "CIP319052.003"
#> [5] "CIP319052.006"
#>
#> $geno_2
#> [1] "CIP319008.002" "CIP319019.001" "CIP319020.004" "CIP319020.005"
#> [5] "CIP319020.008"
#>
#> $geno_1
#> [1] "CIP319020.003" "CIP319020.006" "CIP319020.007" "CIP319020.002"
#> [5] "CIP319027.001"
Do partially replicated randomization
d <- tempdir()
data("ilri", package = "pbwrangler")
ins_ilri <- geno_by_tubers(ilri)
ilri_prep <- rand_Prep(
tot = 57,
ins = ins_ilri,
rowD = 12,
trial = "KE24ILR-BIO-IT01",
n_dummies = 5,
loc = "ilri",
totReps =96,
trtrepP = trial_design_meta()$trep,
trtgroup = trial_design_meta()$trgroup,
block_lst = trial_design_meta()$block_list,
path = NULL
)
#> Phase, Search%, A-measure
#> [1] 1.0000000 0.0000000 0.2718985
#> [1] 1.0000000 10.0000000 0.2666667
#> [1] 1.0000000 20.0000000 0.2666667
#> [1] 1.0000000 30.0000000 0.2666667
#> [1] 1.0000000 40.0000000 0.2666667
#> [1] 1.0000000 50.0000000 0.2666667
#> [1] 1.0000000 60.0000000 0.2666667
#> [1] 1.0000000 70.0000000 0.2666667
#> [1] 1.0000000 80.0000000 0.2666667
#> [1] 1.0000000 90.0000000 0.2666667
#> [1] 1.0000000 100.0000000 0.2666667
#> [1] 0.2666667 0.2666667 0.2666667 0.0000000 0.0000000 0.0000000 0.0000000
#> [8] 0.0000000 0.0000000 0.0000000
#> 8 rows by 2 columns
#> Phase, Search%, A-measure
#> [1] 2.0000000 0.0000000 0.2700262
#> [1] 2.0000000 10.0000000 0.2700262
#> [1] 2.0000000 20.0000000 0.2700262
#> [1] 2.0000000 30.0000000 0.2700262
#> [1] 2.0000000 40.0000000 0.2700262
#> [1] 2.0000000 50.0000000 0.2700262
#> [1] 2.0000000 60.0000000 0.2700262
#> [1] 2.0000000 70.0000000 0.2700262
#> [1] 2.0000000 80.0000000 0.2700262
#> [1] 2.0000000 90.0000000 0.2700262
#> [1] 2.0000000 100.0000000 0.2700262
#> [1] 0.2666667 0.2834646 0.2700262 0.0000000 0.0000000 0.0000000 0.0000000
#> [8] 0.0000000 0.0000000 0.0000000
#> Phase, Search%, A-measure
#> [1] 1.0000000 0.0000000 0.7591574
#> [1] 1.0000000 10.0000000 0.7437399
#> [1] 1.0000000 20.0000000 0.7437399
#> [1] 1.0000000 30.0000000 0.7437399
#> [1] 1.0000000 40.0000000 0.7437399
#> [1] 1.0000000 50.0000000 0.7437399
#> [1] 1.0000000 60.0000000 0.7437399
#> [1] 1.0000000 70.0000000 0.7437399
#> [1] 1.0000000 80.0000000 0.7437399
#> [1] 1.0000000 90.0000000 0.7437399
#> [1] 1.0000000 100.0000000 0.7437399
#> [1] 0.7390377 0.7625489 0.7437399 0.0000000 0.0000000 0.0000000 0.0000000
#> [8] 0.0000000 0.0000000 0.0000000
#> 8 rows by 2 columns
#> Phase, Search%, A-measure
#> [1] 2.0000000 0.0000000 0.7842118
#> [1] 2.0000000 10.0000000 0.7717472
#> [1] 2.0000000 20.0000000 0.7717472
#> [1] 2.0000000 30.0000000 0.7717472
#> [1] 2.0000000 40.0000000 0.7717472
#> [1] 2.0000000 50.0000000 0.7716664
#> [1] 2.0000000 60.0000000 0.7716664
#> [1] 2.0000000 70.0000000 0.7716664
#> [1] 2.0000000 80.0000000 0.7716664
#> [1] 2.0000000 90.0000000 0.7716664
#> [1] 2.0000000 100.0000000 0.7716664
#> [1] 0.7593699 0.8208523 0.7716664 0.0000000 0.0000000 0.0000000 0.0000000
#> [8] 0.0000000 0.0000000 0.0000000
#> [1] "#####################################"
#> [1] "# Final search has not yet been run #"
#> [1] "#####################################"
#> Phase, Search%, A-measure
#> [1] 1.000000 0.000000 1.896849
#> [1] 1.000000 10.000000 1.756136
#> [1] 1.00000 20.00000 1.69891
#> [1] 1.000000 30.000000 1.679198
#> [1] 1.000000 40.000000 1.656411
#> [1] 1.000000 50.000000 1.633088
#> [1] 1.000000 60.000000 1.620262
#> [1] 1.000000 70.000000 1.613022
#> [1] 1.000000 80.000000 1.600019
#> [1] 1.000000 90.000000 1.590032
#> [1] 1.000000 100.000000 1.590032
#> [1] 1.590032 1.590032 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
#> [9] 0.000000 0.000000
head(ilri_prep$design)
#> [1] NA
rowcol design
rowcol design for 4 genotypes with 2 checks
df <- data.frame(geno = LETTERS[1:4])
rcD <-
randomize_row_col(
clones = df,
trial = "KE24ILR-BW-ST01",
tot = 6,
rowD = 6,
n_dummies = 0,
to_add = 2,
rep = 3,
path = NULL
)
#> Phase, Search%, A-measure
#> [1] 1.000000 0.000000 1.002506
#> [1] 1.0000000 10.0000000 0.7919192
#> [1] 1.0000000 20.0000000 0.7919192
#> [1] 1.0000000 30.0000000 0.7919192
#> [1] 1.0000000 40.0000000 0.7919192
#> [1] 1.0000000 50.0000000 0.7919192
#> [1] 1.0000000 60.0000000 0.7919192
#> [1] 1.0000000 70.0000000 0.7919192
#> [1] 1.0000000 80.0000000 0.7919192
#> [1] 1.0000000 90.0000000 0.7919192
#> [1] 1.0000000 100.0000000 0.7919192
#> [1] 0.7919192 0.7919192 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
#> [8] 0.0000000 0.0000000 0.0000000
#> 1 row by 3 columns
#> Phase, Search%, A-measure
#> [1] 2.0000000 0.0000000 0.8554697
#> [1] 2.0000000 10.0000000 0.7919192
#> [1] 2.0000000 20.0000000 0.7919192
#> [1] 2.0000000 30.0000000 0.7919192
#> [1] 2.0000000 40.0000000 0.7919192
#> [1] 2.0000000 50.0000000 0.7919192
#> [1] 2.0000000 60.0000000 0.7919192
#> [1] 2.0000000 70.0000000 0.7919192
#> [1] 2.0000000 80.0000000 0.7919192
#> [1] 2.0000000 90.0000000 0.7919192
#> [1] 2.0000000 100.0000000 0.7919192
#> [1] 0.7919192 0.7919192 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
#> [8] 0.0000000 0.0000000 0.0000000
head(rcD$fieldbook)
#> plot geno entry row column rep
#> 1 KE24ILR-BW-ST01-00001 B 2 1 1 1
#> 2 KE24ILR-BW-ST01-00002 D 4 2 1 1
#> 3 KE24ILR-BW-ST01-00003 Sagitta 5 3 1 1
#> 4 KE24ILR-BW-ST01-00004 A 1 4 1 1
#> 5 KE24ILR-BW-ST01-00005 C 3 5 1 1
#> 6 KE24ILR-BW-ST01-00006 Sherekea 6 6 1 1
Randomize a resolvable row column of 55 clones replicated once, 4 checks replicated 4 times and 3 dummies replicated three times.
data("ilri")
dat <- ilri %>% dplyr::select(geno)
# dat %>% dplyr::pull()
rrc80 <- randomize_res_row_col(
clones = dat,
tot = 62,
trial = "KE24ILR-BW-ST01",
totReps = 80,
trtrepP = rep(c(1, 4, 3), c(55, 4, 3)),
block_lst = list(c(16,5), c(8,5)),
rowD = 16,
n_dummies = 3,
season = "season-2025",
path = NULL
)
#> Phase, Search%, A-measure
#> [1] 1.000000 0.000000 1.321959
#> [1] 1.000000 10.000000 1.303753
#> [1] 1.000000 20.000000 1.303753
#> [1] 1.000000 30.000000 1.303753
#> [1] 1.000000 40.000000 1.303753
#> [1] 1.000000 50.000000 1.303753
#> [1] 1.000000 60.000000 1.303753
#> [1] 1.000000 70.000000 1.303753
#> [1] 1.000000 80.000000 1.303753
#> [1] 1.000000 90.000000 1.303753
#> [1] 1.000000 100.000000 1.303753
#> [1] 1.303753 1.303753 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
#> [9] 0.000000 0.000000
#> 8 rows by 5 columns
#> Phase, Search%, A-measure
#> [1] 2.00000 0.00000 1.99169
#> [1] 2.000000 10.000000 1.633634
#> [1] 2.000000 20.000000 1.632903
#> [1] 2.0000 30.0000 1.6315
#> [1] 2.0000 40.0000 1.6315
#> [1] 2.000000 50.000000 1.630169
#> [1] 2.000000 60.000000 1.630169
#> [1] 2.000000 70.000000 1.630169
#> [1] 2.000000 80.000000 1.630169
#> [1] 2.000000 90.000000 1.628684
#> [1] 2.000000 100.000000 1.628684
#> [1] 1.628684 1.628684 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
#> [9] 0.000000 0.000000
head(rrc80$fieldbook)
#> plot geno entry row column rep
#> 1 KE24ILR-BW-ST01-00001 CIP319027.006 20 1 1 1
#> 2 KE24ILR-BW-ST01-00002 CIP319020.003 7 2 1 1
#> 3 KE24ILR-BW-ST01-00003 CIP319040.004 25 3 1 1
#> 4 KE24ILR-BW-ST01-00004 dummy-Unica 58 4 1 1
#> 5 KE24ILR-BW-ST01-00005 CIP319064.007 54 5 1 1
#> 6 KE24ILR-BW-ST01-00006 CIP319020.006 10 6 1 1
Randomize without any replications, more like drawing a random sample
data("ilri")
dat <- ilri %>% dplyr::select(geno)
d <- randomize_noRep(
rowD = 11,
tot = 55,
ins = dat,
path = NULL
)
head(d)
#> plot geno row col
#> 1 KE25MOL-HERL-ST01-00001 CIP319027.003 1 1
#> 2 KE25MOL-HERL-ST01-00002 CIP319064.003 2 1
#> 3 KE25MOL-HERL-ST01-00003 CIP319008.002 3 1
#> 4 KE25MOL-HERL-ST01-00004 CIP319020.004 4 1
#> 5 KE25MOL-HERL-ST01-00005 CIP319020.001 5 1
#> 6 KE25MOL-HERL-ST01-00006 CIP319052.012 6 1