Get started • pbwrangler

library(pbwrangler)

Substring first n elements

a <- c("asdf", "bnmkk")
my_left(a, num = 3)
#> [1] "asd" "bnm"

a <- c("asdf", "bnmkk")
my_right(a, num = 3)
#> [1] "sdf" "mkk"

df <- data.frame(a = NA, b = c(1:3, NA), c = 0)
drop_nas(df)
#>    b c
#> 1  1 0
#> 2  2 0
#> 3  3 0
#> 4 NA 0
drop_zeros(df)
#>    a  b
#> 1 NA  1
#> 2 NA  2
#> 3 NA  3
#> 4 NA NA

clean directory/file names

Clean file names from a directory using clean_dir_name()


f <- system.file("uon-trial-1.csv", package = "pbwrangler")
clean_dir_name(f)
#> [1] "uon-trial-1"

List files

List files in a directory using list_files()


list_files(system.file(package = "pbwrangler"))[1:5]
#> [1] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/meta-data.xlsx"                   
#> [2] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/Pedigree_advanced_clones_SSA.xlsx"
#> [3] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/uon-trial-1.csv"                  
#> [4] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/uon-trial-1.xlsx"                 
#> [5] "C:/Users/BasilOkola/AppData/Local/R/win-library/4.3/pbwrangler/year-cross.csv"

Compare dataframes to find shared columns

If you suspect two fieldbooks to be similar, you can compare them.


f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir=NULL, file_to_read = f)[[1]]
f1 <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
df1 <- read_workbooks(dir=NULL, file_to_read = f, sheet_name = "Sheet1")[[1]]
compare_df(df, df1)
#>    dim_X  dim_Y design_X design_Y cols_not_X cols_not_Y
#> 1 108,32 108,32     6*18     6*18

Merge notes/obs to one column

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir=NULL, file_to_read = f)[[1]]
merge_note_obs(df)[1:5, "obs"]
#> [1] "long oblong,,good yield" "very good yield"        
#> [3] "buds, pinkish eyes"      "good yield"             
#> [5] NA

Create a meta-data file for each experiment designed during a season

Create a meta-data to accompany fieldbooks.

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f)
create_meta_file(df, season = "season-2024", d_dir = tempdir())
#> $`season-2024`
#> # A tibble: 1 × 17
#>   trial_name breeding_program location year  transplanting_date design_type
#>   <lgl>      <lgl>            <chr>    <chr> <lgl>              <lgl>      
#> 1 NA         NA               NA       2024  NA                 NA         
#> # ℹ 11 more variables: description <lgl>, trial_type <lgl>, plot_width <lgl>,
#> #   plot_length <lgl>, planting_date <date>, harvest_date <date>,
#> #   number_of_plants_per_ridge <lgl>, number_of_ridges_per_plot <lgl>,
#> #   space_between_ridges <lgl>, space_between_plants_in_ridges <lgl>,
#> #   number_of_plants_per_plot <lgl>

Capture trial location

You could try capture trial location from file names:

f <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
f1 <- system.file("uon-trial-1.csv", package = "pbwrangler")

df <- read_workbooks(dir = NULL, file_to_read = f, sheet_name = "Sheet 1")
df1 <- read_workbooks(dir = NULL, file_to_read = f1)

dat <- c(df, df1)
# this has a loc variable
dat_loc <- capture_location(dat)
lapply(dat_loc, function(x) x$loc[1:5])
#> $`UON-TRIAL-1`
#> [1] "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1"
#> 
#> $`UON-TRIAL-1`
#> [1] "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1" "UON-TRIAL-1"

write data

You can write trial data using write_data():

f <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
f1 <- system.file("uon-trial-1.csv", package = "pbwrangler")

df <- read_workbooks(dir = NULL, file_to_read = f, sheet_name = "Sheet 1")
df1 <- read_workbooks(dir = NULL, file_to_read = f1)
dat <- c(df, df1)

write_data(dir = tempdir(), data_list = dat, season = "season-2024")

create genotype family codes

You can also create family codes from accession names using create_family_vars().

f <- system.file("uon-trial-1.xlsx", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f, sheet_name = "Sheet 1") %>%
  capture_location(.) %>% `[[`(1)

create_family_vars(df)[1:5, ]
#>            geno         loc old_family_code old_year_of_cross year_of_cross
#> 1 CIP317004.001 UON-TRIAL-1       CIP317004              2017          2021
#> 2 CIP317005.036 UON-TRIAL-1       CIP317005              2017          2021
#> 3 CIP317007.103 UON-TRIAL-1       CIP317007              2017          2021
#> 4 CIP317013.120 UON-TRIAL-1       CIP317013              2017          2021
#> 5 CIP317026.004 UON-TRIAL-1       CIP317026              2017          2021
#>   new_family_code  new_clone_id
#> 1       CIP321004 CIP321004.001
#> 2       CIP321005 CIP321005.036
#> 3       CIP321007 CIP321007.103
#> 4       CIP321013 CIP321013.120
#> 5       CIP321026 CIP321026.004

clean clone/genotype names

read accession files that have been fixed and confirmed correct

organize accessions for export

recode variables

Parse variables from one type to another using recode_var().

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>% `[[`(1)
df_c <- recode_var(df)

run routine data checks

Run data checks using st4gi package and compute derived columns

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
  `[[`(1)
df_checked <- run_checks(df)

check accession names

Check accession names using check_geno().

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) 
df_checked <- check_geno(df)

update accession names

Update accession names with update_geno()

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
  `[[`(1)
df_checked <- update_geno(df)

rename columns

Rename variables to match ontology lables with rename_cols()

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
  `[[`(1)
df <- rename_cols(df)

process trial data

Apply a number of data cleaning functions to trial data with pre_process_trials()

A number of computed columns are generated in the process.

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) 
df_out <- pre_process_trials(df) |> process_trials() %>%
  purrr::map(., run_data_processes)
purrr::map(df_out, names_df)
#> $`UON-TRIAL-1`
#>  [1] "atmw"                "col"                 "entry"              
#>  [4] "flower_colour"       "flowering"           "flowering_date"     
#>  [7] "geno"                "growth_speed"        "late_blight_60_days"
#> [10] "mtwp"                "mtyna"               "n_tubers"           
#> [13] "nmtp"                "npe"                 "ntp"                
#> [16] "obs"                 "overall_impression"  "pgh"                
#> [19] "photo"               "plant_unif"          "plant_vigor"        
#> [22] "plot"                "ppe"                 "rep"                
#> [25] "row"                 "se"                  "tbfsh1"             
#> [28] "tbskn1"              "tub_size"            "tub_unif"           
#> [31] "tuber_apper"         "unique"

Create a trial design object for row-col design

Create a trial design object, fit model and extract predictions

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
  purrr::map(
    ., ~dplyr::mutate(
      ., year = "2024", loc = "UON", trial = "lbht"
    )
  )
df_out <- pre_process_trials(df) |> process_trials() %>%
  purrr::map(., run_data_processes) %>% `[[`(1) 
# trial design object
TD <- create_td(df_out, design = "rowcol")
# fit 
fit_TD <- fit_td(TD, trait = "mtwp")
# extract predictions

pred_TD <- extract_blups(fit_TD, pred = c("BLUEs", "seBLUEs"))

pred_TD[[1]][1:5,]
#>         genotype trial BLUEs_mtwp seBLUEs_mtwp
#> 1  CIP312084.731  lbht  10.377484     1.156141
#> 2 CIP3177011.028  lbht   9.831602     1.155003
#> 3 CIP3177023.046  lbht   6.642917     1.190464
#> 4 CIP3177038.001  lbht   4.890195     1.163434
#> 5 CIP3187009.044  lbht  13.939097     1.160588

get in/valid column names

Get invalid columns (labels not defined in ontology)

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
df <- read_workbooks(dir = NULL, file_to_read = f) %>%
  purrr::map(
    ., ~dplyr::mutate(
      ., year = "2024", loc = "UON", trial = "lbht"
    )
  )
df_out <- pre_process_trials(df) |> process_trials() %>%
  purrr::map(., run_data_processes) %>% `[[`(1) 

subset_invalid_cols(df_out) %>% colnames(.)
#>  [1] "geno"                "unique"              "entry"              
#>  [4] "growth_speed"        "flowering_date"      "flower_colour"      
#>  [7] "overall_impression"  "late_blight_60_days" "obs"                
#> [10] "photo"               "trial"               "n_tubers"

tissue culture helpers

read trial data files

f <- system.file("uon-trial-1.csv", package = "pbwrangler")
d <- read_workbooks(dir = NULL, file_to_read = f)
lapply(d, function(x) x[1:5, 1:5])
#> $`UON-TRIAL-1`
#>           unique plot          geno entry row
#> 1 1CIP317004.001    1 CIP317004.001    12   1
#> 2 2CIP317005.036    2 CIP317005.036    14   2
#> 3 3CIP317007.103    3 CIP317007.103    20   3
#> 4 4CIP317013.120    4 CIP317013.120    23   4
#> 5 5CIP317026.004    5 CIP317026.004    28   5

field trial randomization

partially replicated (prep) design

data("ilri", package = "pbwrangler")
ins_ilri <- geno_by_tubers(ilri)
lapply(ins_ilri, head, 5)
#> $geno_3
#> [1] "CIP319020.009" "CIP319027.002" "CIP319027.003" "CIP319052.003"
#> [5] "CIP319052.006"
#> 
#> $geno_2
#> [1] "CIP319008.002" "CIP319019.001" "CIP319020.004" "CIP319020.005"
#> [5] "CIP319020.008"
#> 
#> $geno_1
#> [1] "CIP319020.003" "CIP319020.006" "CIP319020.007" "CIP319020.002"
#> [5] "CIP319027.001"

Do partially replicated randomization

d <- tempdir()
data("ilri", package = "pbwrangler")
ins_ilri <- geno_by_tubers(ilri)
ilri_prep <- rand_Prep(
  tot = 57,
  ins = ins_ilri,
  rowD = 12,
  trial = "KE24ILR-BIO-IT01",
  n_dummies = 5,
  loc = "ilri",
  totReps =96,
  trtrepP = trial_design_meta()$trep,
  trtgroup = trial_design_meta()$trgroup,
  block_lst = trial_design_meta()$block_list,
  path = NULL
)
#>      Phase,    Search%,    A-measure
#> [1] 1.0000000 0.0000000 0.2718985
#> [1]  1.0000000 10.0000000  0.2666667
#> [1]  1.0000000 20.0000000  0.2666667
#> [1]  1.0000000 30.0000000  0.2666667
#> [1]  1.0000000 40.0000000  0.2666667
#> [1]  1.0000000 50.0000000  0.2666667
#> [1]  1.0000000 60.0000000  0.2666667
#> [1]  1.0000000 70.0000000  0.2666667
#> [1]  1.0000000 80.0000000  0.2666667
#> [1]  1.0000000 90.0000000  0.2666667
#> [1]   1.0000000 100.0000000   0.2666667
#>  [1] 0.2666667 0.2666667 0.2666667 0.0000000 0.0000000 0.0000000 0.0000000
#>  [8] 0.0000000 0.0000000 0.0000000
#> 8 rows by 2 columns
#>      Phase,    Search%,    A-measure
#> [1] 2.0000000 0.0000000 0.2700262
#> [1]  2.0000000 10.0000000  0.2700262
#> [1]  2.0000000 20.0000000  0.2700262
#> [1]  2.0000000 30.0000000  0.2700262
#> [1]  2.0000000 40.0000000  0.2700262
#> [1]  2.0000000 50.0000000  0.2700262
#> [1]  2.0000000 60.0000000  0.2700262
#> [1]  2.0000000 70.0000000  0.2700262
#> [1]  2.0000000 80.0000000  0.2700262
#> [1]  2.0000000 90.0000000  0.2700262
#> [1]   2.0000000 100.0000000   0.2700262
#>  [1] 0.2666667 0.2834646 0.2700262 0.0000000 0.0000000 0.0000000 0.0000000
#>  [8] 0.0000000 0.0000000 0.0000000
#>      Phase,    Search%,    A-measure
#> [1] 1.0000000 0.0000000 0.7591574
#> [1]  1.0000000 10.0000000  0.7437399
#> [1]  1.0000000 20.0000000  0.7437399
#> [1]  1.0000000 30.0000000  0.7437399
#> [1]  1.0000000 40.0000000  0.7437399
#> [1]  1.0000000 50.0000000  0.7437399
#> [1]  1.0000000 60.0000000  0.7437399
#> [1]  1.0000000 70.0000000  0.7437399
#> [1]  1.0000000 80.0000000  0.7437399
#> [1]  1.0000000 90.0000000  0.7437399
#> [1]   1.0000000 100.0000000   0.7437399
#>  [1] 0.7390377 0.7625489 0.7437399 0.0000000 0.0000000 0.0000000 0.0000000
#>  [8] 0.0000000 0.0000000 0.0000000
#> 8 rows by 2 columns
#>      Phase,    Search%,    A-measure
#> [1] 2.0000000 0.0000000 0.7842118
#> [1]  2.0000000 10.0000000  0.7717472
#> [1]  2.0000000 20.0000000  0.7717472
#> [1]  2.0000000 30.0000000  0.7717472
#> [1]  2.0000000 40.0000000  0.7717472
#> [1]  2.0000000 50.0000000  0.7716664
#> [1]  2.0000000 60.0000000  0.7716664
#> [1]  2.0000000 70.0000000  0.7716664
#> [1]  2.0000000 80.0000000  0.7716664
#> [1]  2.0000000 90.0000000  0.7716664
#> [1]   2.0000000 100.0000000   0.7716664
#>  [1] 0.7593699 0.8208523 0.7716664 0.0000000 0.0000000 0.0000000 0.0000000
#>  [8] 0.0000000 0.0000000 0.0000000
#> [1] "#####################################"
#> [1] "# Final search has not yet been run #"
#> [1] "#####################################"
#>      Phase,    Search%,    A-measure
#> [1] 1.000000 0.000000 1.896849
#> [1]  1.000000 10.000000  1.756136
#> [1]  1.00000 20.00000  1.69891
#> [1]  1.000000 30.000000  1.679198
#> [1]  1.000000 40.000000  1.656411
#> [1]  1.000000 50.000000  1.633088
#> [1]  1.000000 60.000000  1.620262
#> [1]  1.000000 70.000000  1.613022
#> [1]  1.000000 80.000000  1.600019
#> [1]  1.000000 90.000000  1.590032
#> [1]   1.000000 100.000000   1.590032
#>  [1] 1.590032 1.590032 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
#>  [9] 0.000000 0.000000
head(ilri_prep$design)
#> [1] NA

rowcol design

rowcol design for 4 genotypes with 2 checks

df <- data.frame(geno = LETTERS[1:4])
rcD <-
  randomize_row_col(
    clones = df,
    trial = "KE24ILR-BW-ST01",
    tot = 6,
    rowD = 6,
    n_dummies = 0,
    to_add = 2,
    rep = 3,
    path = NULL
  )
#>      Phase,    Search%,    A-measure
#> [1] 1.000000 0.000000 1.002506
#> [1]  1.0000000 10.0000000  0.7919192
#> [1]  1.0000000 20.0000000  0.7919192
#> [1]  1.0000000 30.0000000  0.7919192
#> [1]  1.0000000 40.0000000  0.7919192
#> [1]  1.0000000 50.0000000  0.7919192
#> [1]  1.0000000 60.0000000  0.7919192
#> [1]  1.0000000 70.0000000  0.7919192
#> [1]  1.0000000 80.0000000  0.7919192
#> [1]  1.0000000 90.0000000  0.7919192
#> [1]   1.0000000 100.0000000   0.7919192
#>  [1] 0.7919192 0.7919192 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
#>  [8] 0.0000000 0.0000000 0.0000000
#> 1 row by 3 columns
#>      Phase,    Search%,    A-measure
#> [1] 2.0000000 0.0000000 0.8554697
#> [1]  2.0000000 10.0000000  0.7919192
#> [1]  2.0000000 20.0000000  0.7919192
#> [1]  2.0000000 30.0000000  0.7919192
#> [1]  2.0000000 40.0000000  0.7919192
#> [1]  2.0000000 50.0000000  0.7919192
#> [1]  2.0000000 60.0000000  0.7919192
#> [1]  2.0000000 70.0000000  0.7919192
#> [1]  2.0000000 80.0000000  0.7919192
#> [1]  2.0000000 90.0000000  0.7919192
#> [1]   2.0000000 100.0000000   0.7919192
#>  [1] 0.7919192 0.7919192 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
#>  [8] 0.0000000 0.0000000 0.0000000
head(rcD$fieldbook)
#>                    plot     geno entry row column rep
#> 1 KE24ILR-BW-ST01-00001        B     2   1      1   1
#> 2 KE24ILR-BW-ST01-00002        D     4   2      1   1
#> 3 KE24ILR-BW-ST01-00003  Sagitta     5   3      1   1
#> 4 KE24ILR-BW-ST01-00004        A     1   4      1   1
#> 5 KE24ILR-BW-ST01-00005        C     3   5      1   1
#> 6 KE24ILR-BW-ST01-00006 Sherekea     6   6      1   1

Randomize a resolvable row column of 55 clones replicated once, 4 checks replicated 4 times and 3 dummies replicated three times.



data("ilri")
dat <- ilri %>% dplyr::select(geno) 
# dat %>% dplyr::pull() 


rrc80 <- randomize_res_row_col(
  clones = dat,
  tot = 62,
  trial = "KE24ILR-BW-ST01",
  totReps = 80,
  trtrepP = rep(c(1, 4, 3), c(55, 4, 3)),
  block_lst = list(c(16,5), c(8,5)),
  rowD = 16,
  n_dummies = 3,
  season = "season-2025",
  path = NULL
  
)
#>      Phase,    Search%,    A-measure
#> [1] 1.000000 0.000000 1.321959
#> [1]  1.000000 10.000000  1.303753
#> [1]  1.000000 20.000000  1.303753
#> [1]  1.000000 30.000000  1.303753
#> [1]  1.000000 40.000000  1.303753
#> [1]  1.000000 50.000000  1.303753
#> [1]  1.000000 60.000000  1.303753
#> [1]  1.000000 70.000000  1.303753
#> [1]  1.000000 80.000000  1.303753
#> [1]  1.000000 90.000000  1.303753
#> [1]   1.000000 100.000000   1.303753
#>  [1] 1.303753 1.303753 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
#>  [9] 0.000000 0.000000
#> 8 rows by 5 columns
#>      Phase,    Search%,    A-measure
#> [1] 2.00000 0.00000 1.99169
#> [1]  2.000000 10.000000  1.633634
#> [1]  2.000000 20.000000  1.632903
#> [1]  2.0000 30.0000  1.6315
#> [1]  2.0000 40.0000  1.6315
#> [1]  2.000000 50.000000  1.630169
#> [1]  2.000000 60.000000  1.630169
#> [1]  2.000000 70.000000  1.630169
#> [1]  2.000000 80.000000  1.630169
#> [1]  2.000000 90.000000  1.628684
#> [1]   2.000000 100.000000   1.628684
#>  [1] 1.628684 1.628684 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
#>  [9] 0.000000 0.000000

head(rrc80$fieldbook)
#>                    plot          geno entry row column rep
#> 1 KE24ILR-BW-ST01-00001 CIP319027.006    20   1      1   1
#> 2 KE24ILR-BW-ST01-00002 CIP319020.003     7   2      1   1
#> 3 KE24ILR-BW-ST01-00003 CIP319040.004    25   3      1   1
#> 4 KE24ILR-BW-ST01-00004   dummy-Unica    58   4      1   1
#> 5 KE24ILR-BW-ST01-00005 CIP319064.007    54   5      1   1
#> 6 KE24ILR-BW-ST01-00006 CIP319020.006    10   6      1   1

Randomize without any replications, more like drawing a random sample


data("ilri")
dat <- ilri %>% dplyr::select(geno) 

d <- randomize_noRep(
  rowD = 11,
  tot = 55,
  ins = dat,
  path = NULL
)

head(d)
#>                      plot          geno row col
#> 1 KE25MOL-HERL-ST01-00001 CIP319027.003   1   1
#> 2 KE25MOL-HERL-ST01-00002 CIP319064.003   2   1
#> 3 KE25MOL-HERL-ST01-00003 CIP319008.002   3   1
#> 4 KE25MOL-HERL-ST01-00004 CIP319020.004   4   1
#> 5 KE25MOL-HERL-ST01-00005 CIP319020.001   5   1
#> 6 KE25MOL-HERL-ST01-00006 CIP319052.012   6   1