3  Baseline Testing

Purpose: test baseline reporting groups for accuracy and efficiency using 100% simulations, self-assignment tests, and simulated mixtures. up

3.1 Data

Load raw genotype data, sibling IDs to drop (full siblings), and gps locations (to sort)

Code
data <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/usr_gsi_gtseq_plusmeta.csv")
drop_sibs <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Relatedness/UpperSnakeRiver_GTseq_Baseline_DropFullSiblings.csv")
gps <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Landscape Covariates/RepUnit_LatLong.csv")

gps_sp <- sf::st_as_sf(gps, coords = c("lon", "lat"), crs = 4326)
mapview::mapview(gps_sp, col.regions = "blue", legend = FALSE)

3.1.1 Format data

Code
gt_mat <- data[,-c(1:3)] # pull out sequence data
gt_mat[gt_mat == 0] <- NA # change the missing data to NA

# ID rows corresponding to baseline vs mixture samples
ref_indv <- which(data$Category == "baseline") 
mix_indv <- which(data$Category == "mixture") 

# create data for sample_type column 
sample_type <- rep("reference", nrow(data))
sample_type[mix_indv] <- "mixture"

# create data for reporting unit column
repunit <- data$Pop
repunit[mix_indv] <- NA

# create data for collection column (same as reporting unit, except for mixture samples)
collection <- data$Pop

# IDFG EFGL sample ID
indiv <- data$SampNumber

# combine into singe data frame
input_data <- tibble(sample_type, repunit, collection, indiv)
all_input <- bind_cols(input_data, gt_mat)

# ensure that baseline individuals that are likely BKT are not included (4 from NF Spread Ck and 1 from Upper Mainstem Spread Ck)
all_input <- all_input %>% filter(!indiv %in% c("OclVARI23G_0013", "OclVARI23G_0012", "OclVARI23G_0010", "OclVARI23G_0011", "OclVARI23G_0099"))

3.1.2 Filter data

Drop putative “mixture” reporting groups

  • snake_murieranch collection from ~ 2km downstream of UBBC/Blacktail…preliminary self-assignment tests show these samples assign to those streams
  • lake_na collection from upper lake creek…preliminary self-assignment tests show these samples assign to UBBC, Blacktail, and Cottonwood.
    • Which makes sense as pretty much all flow at this location derives from Granite Diversion Ditch, which pulls water from just downstream of Moose.
    • Additionally, spawning and rearing capacity is likely very limited in this system as flow is all but turned off by late summer/early fall after irrigation season.
Code
all_input <- all_input %>% filter(!repunit %in% c("snake_murieranch", "lake_NA"))
unique(all_input$repunit)
 [1] "bacon_NA"                   "bailey_NA"                 
 [3] "bear_NA"                    "blackrock_lower"           
 [5] "blackrock_upper"            "blacktail_NA"              
 [7] "blindbull_NA"               "bluecrane_ford"            
 [9] "boulder_NA"                 "box_NA"                    
[11] "cabin_NA"                   "clear_NA"                  
[13] "cliff_NA"                   "coburn_NA"                 
[15] "cody_moment"                "cottonwood_grosventre"     
[17] "cottonwood_nps"             "cowboycabin_NA"            
[19] "cowboycabin_springchannels" "crystal_lower"             
[21] "crystal_upper"              "deadmansbar_NA"            
[23] "dell_NA"                    "ditch_NA"                  
[25] "dog_NA"                     "fall_lower"                
[27] "fall_upper"                 "fish_grosventre"           
[29] "fish_lower"                 "fish_upper"                
[31] "flagstaff_NA"               "flat_NA"                   
[33] "fordspring_NA"              "goosewing_NA"              
[35] "granite_lower"              "granite_upper"             
[37] "grosventre_lower"           "hoback_upper"              
[39] "horse_NA"                   "jack_NA"                   
[41] "lava_NA"                    "leidy_NA"                  
[43] "littlegreys_lower"          "littlegreys_upper"         
[45] "lowerbarbc_NA"              "mosquito_NA"               
[47] "northbuffalofork_NA"        "deadman_greys"             
[49] "pacific_lower"              "pacific_upper"             
[51] "rock_NA"                    "schwabacher_NA"            
[53] "sheep_NA"                   "shoal_NA"                  
[55] "slate_lower"                "slate_upper"               
[57] NA                           "snakeriversidechannel_NA"  
[59] "southbuffalofork_NA"        "spread_northfork"          
[61] "spread_southfork"           "spread_uppermainstem"      
[63] "spring_nps"                 "spring_tss"                
[65] "steer_NA"                   "threechannel_eastfork"     
[67] "threechannel_westfork"      "upperbarbc_NA"             
[69] "white_NA"                   "willow_NA"                 

Drop big sibling groups (see BaselineRelatedness.R): retain 2 full siblings per family per Ostergren et al. (2020) Molecular Ecology Resources

Code
all_input <- all_input %>% filter(!indiv %in% drop_sibs$drop_sibs)

Write out data files

Code
# write out
write_csv(all_input, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs.csv")
all_input <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs.csv")

3.2 Define reporting units

Drop and redefine reporting units based on sample size, geographic proximity, and low genetic differentiation among proximate populations.

First, identify and combine reporting groups with low sample sizes.

Code
all_input %>% filter(!is.na(repunit)) %>% group_by(repunit) %>% summarise(ct = n()) %>% ungroup() %>% reactable() %>% add_title("Raw, unaltered reporting unit (population) sample sizes.")

Raw, unaltered reporting unit (population) sample sizes.

Redefine reporting groups for groups with ~low sample size AND low self-assignment rates calculated from raw data OR collections from different locations within the same stream.

After running 2nd round of self-assignment tests, drop the groupings that led to reduced self-assignment rates relative to Will’s original work treating collections as reporting groups

Code
all_input$repunit[which(all_input$repunit == "bluecrane_ford")] <- "cody_bluecrane"
all_input$repunit[which(all_input$repunit == "cody_moment")] <- "cody_bluecrane"

all_input$repunit[which(all_input$repunit == "cowboycabin_springchannels")] <- "cowboycabin_NA"

all_input$repunit[which(all_input$repunit == "threechannel_eastfork")] <- "threechannel_NA"
all_input$repunit[which(all_input$repunit == "threechannel_westfork")] <- "threechannel_NA"

all_input$repunit[which(all_input$repunit == "fish_lower")] <- "fish_NA"
all_input$repunit[which(all_input$repunit == "fish_upper")] <- "fish_NA"

all_input$repunit[which(all_input$repunit == "slate_lower")] <- "slate_NA"
all_input$repunit[which(all_input$repunit == "slate_upper")] <- "slate_NA"

all_input$repunit[which(all_input$repunit == "spread_northfork")] <- "spreadnf_flagstaff"
all_input$repunit[which(all_input$repunit == "flagstaff_NA")] <- "spreadnf_flagstaff"

all_input$repunit[which(all_input$repunit == "fall_upper")] <- "fall_coburn"
all_input$repunit[which(all_input$repunit == "fall_lower")] <- "fall_coburn"
all_input$repunit[which(all_input$repunit == "coburn_NA")] <- "fall_coburn"

all_input$repunit[which(all_input$repunit == "littlegreys_lower")] <- "littlegreys_steer"
all_input$repunit[which(all_input$repunit == "littlegreys_upper")] <- "littlegreys_steer"
all_input$repunit[which(all_input$repunit == "steer_NA")] <- "littlegreys_steer"

all_input$repunit[which(all_input$repunit == "pacific_lower")] <- "pacific_NA"
all_input$repunit[which(all_input$repunit == "pacific_upper")] <- "pacific_NA"

# drop reporting groups that cannot be easily combined
all_input <- all_input %>% filter(!repunit %in% c("schwabacher_NA", "southbuffalofork_NA", "jack_NA", "bacon_NA", "bear_NA", "sheep_NA"))
Code
all_input %>% group_by(repunit) %>% summarise(ct = n()) %>% ungroup() %>% reactable() %>% add_title("Final/combined reporting unit (population) sample sizes.")

Final/combined reporting unit (population) sample sizes.

Write out cleaned data

Code
write_csv(all_input, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs_clean.csv")
all_input <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs_clean.csv")

3.3 Delineate sample type

Separate reference and mixture samples

Code
ref_input <- all_input[which(all_input$sample_type == "reference"),]
mix_input <- all_input[which(all_input$sample_type == "mixture"),]
ref_input <- ref_input %>% mutate_if(is.double, as.integer) # make sure everything is an interger
mix_input <- mix_input %>% mutate_if(is.double, as.integer) # make sure everything is an interger

# view reporting units
unique(ref_input$repunit)
 [1] "bailey_NA"                "blackrock_lower"         
 [3] "blackrock_upper"          "blacktail_NA"            
 [5] "blindbull_NA"             "cody_bluecrane"          
 [7] "boulder_NA"               "box_NA"                  
 [9] "cabin_NA"                 "clear_NA"                
[11] "cliff_NA"                 "fall_coburn"             
[13] "cottonwood_grosventre"    "cottonwood_nps"          
[15] "cowboycabin_NA"           "crystal_lower"           
[17] "crystal_upper"            "deadmansbar_NA"          
[19] "dell_NA"                  "ditch_NA"                
[21] "dog_NA"                   "fish_grosventre"         
[23] "fish_NA"                  "spreadnf_flagstaff"      
[25] "flat_NA"                  "fordspring_NA"           
[27] "goosewing_NA"             "granite_lower"           
[29] "granite_upper"            "grosventre_lower"        
[31] "hoback_upper"             "horse_NA"                
[33] "lava_NA"                  "leidy_NA"                
[35] "littlegreys_steer"        "lowerbarbc_NA"           
[37] "mosquito_NA"              "northbuffalofork_NA"     
[39] "deadman_greys"            "pacific_NA"              
[41] "rock_NA"                  "shoal_NA"                
[43] "slate_NA"                 "snakeriversidechannel_NA"
[45] "spread_southfork"         "spread_uppermainstem"    
[47] "spring_nps"               "spring_tss"              
[49] "threechannel_NA"          "upperbarbc_NA"           
[51] "white_NA"                 "willow_NA"               

Write out and read in reference and mixture data files

Code
write_csv(ref_input, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs_clean_baseline.csv")
write_csv(mix_input, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs_clean_mixture.csv")

ref_input <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs_clean_baseline.csv")
mix_input <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/UpperSnakeRiver_GTseq_InputData_NoSibs_clean_mixture.csv")

ref_input <- ref_input %>% mutate_if(is.double, as.integer) # make sure everything is an interger
mix_input <- mix_input %>% mutate_if(is.double, as.integer) # make sure everything is an interger

3.4 Self-assignment tests

3.4.1 Run self-assignment

Run the self-assignment tests and summarize results by individual, saving only the most likely inferred rep unit:

Code
self_test <- self_assign(ref_input, 5)
Summary Statistics:

1710 Individuals in Sample

266 Loci: Amish_ONCB_RAD_10777, Amish_ONCB_RAD_31309, Amish_YCT_RAD_VAR_10439, Amish_YCT_RAD_VAR_524, Amish_YCT_RAD_VAR_5315, BCT_Amish_ONCB_RAD_44581, BCT_Amish_RAD_VAR_123, BCT_Amish_RAD_VAR_147, BCT_Amish_RAD_VAR_162, BCT_Amish_RAD_VAR_22888, BCT_Amish_RAD_VAR_47, Houston_Oc12653_72, Houston_Oc14458_107, Houston_Oc15873_229, Houston_Oc26415_290, Houston_Oc29335_82, Houston_Oc32144_118, Houston_Oc4332_250, Houston_Oc7137_365, IDFG_YCT_v1_Ocl10752_37, IDFG_YCT_v1_Ocl14778_68, IDFG_YCT_v1_Ocl15057_51, IDFG_YCT_v1_Ocl19676_45, IDFG_YCT_v1_Ocl22037_73, IDFG_YCT_v1_Ocl31603_69, IDFG_YCT_v1_Ocl34332_29, IDFG_YCT_v1_Ocl34660_49, IDFG_YCT_v1_Ocl36010_70, IDFG_YCT_v1_Ocl37592_62, IDFG_YCT_v1_Ocl40511_53, IDFG_YCT_v1_Ocl42695_26, IDFG_YCT_v1_Ocl46627_28, IDFG_YCT_v1_Ocl59008_57, IDFG_YCT_v1_Ocl60310_26, IDFG_YCT_v1_Ocl61681_32, IDFG_YCT_v1_Ocl63987_50, IDFG_YCT_v1_Ocl71066_31, IDFG_YCT_v1_Ocl71780_28, IDFG_YCT_v1_Ocl76788_41, IDFG_YCT_v1_Ocl77114_57, IDFG_YCT_v1_Ocl77417_51, IDFG_YCT_v1_Ocl81780_58, IDFG_YCT_v1_Ocl84360_65, IDFG_YCT_v1_Ocl84938_50, IDFG_YCT_v1_Ocl88159_69, IDFG_YCT_v1_Ocl91407_50, IDFG_YCT_v1_Ocl94063_37, IDFG_YCT_v2_Ocl11239_43, IDFG_YCT_v2_Ocl11315_52, IDFG_YCT_v2_Ocl11368_65, IDFG_YCT_v2_Ocl12924_45, IDFG_YCT_v2_Ocl13566_60, IDFG_YCT_v2_Ocl1388_66, IDFG_YCT_v2_Ocl1427_32, IDFG_YCT_v2_Ocl14551_39, IDFG_YCT_v2_Ocl16282_49, IDFG_YCT_v2_Ocl17893_44, IDFG_YCT_v2_Ocl2084_35, IDFG_YCT_v2_Ocl2146_52, IDFG_YCT_v2_Ocl21864_63, IDFG_YCT_v2_Ocl23695_55, IDFG_YCT_v2_Ocl2550_72, IDFG_YCT_v2_Ocl26454_40, IDFG_YCT_v2_Ocl265_70, IDFG_YCT_v2_Ocl27300_47, IDFG_YCT_v2_Ocl27652_74, IDFG_YCT_v2_Ocl30726_53, IDFG_YCT_v2_Ocl31182_39, IDFG_YCT_v2_Ocl32300_38, IDFG_YCT_v2_Ocl34053_56, IDFG_YCT_v2_Ocl36520_63, IDFG_YCT_v2_Ocl37211_48, IDFG_YCT_v2_Ocl38706_58, IDFG_YCT_v2_Ocl40834_41, IDFG_YCT_v2_Ocl422_41, IDFG_YCT_v2_Ocl42464_53, IDFG_YCT_v2_Ocl43873_47, IDFG_YCT_v2_Ocl44431_43, IDFG_YCT_v2_Ocl44830_68, IDFG_YCT_v2_Ocl46339_34, IDFG_YCT_v2_Ocl49447_63, IDFG_YCT_v2_Ocl49571_62, IDFG_YCT_v2_Ocl50017_60, IDFG_YCT_v2_Ocl50071_45, IDFG_YCT_v2_Ocl519_41, IDFG_YCT_v2_Ocl52778_33, IDFG_YCT_v2_Ocl54193_34, IDFG_YCT_v2_Ocl54206_70, IDFG_YCT_v2_Ocl5447_29, IDFG_YCT_v2_Ocl54533_63, IDFG_YCT_v2_Ocl55445_29, IDFG_YCT_v2_Ocl55778_72, IDFG_YCT_v2_Ocl55912_68, IDFG_YCT_v2_Ocl55949_70, IDFG_YCT_v2_Ocl56279_67, IDFG_YCT_v2_Ocl56509_52, IDFG_YCT_v2_Ocl60251_33, IDFG_YCT_v2_Ocl6060_34, IDFG_YCT_v2_Ocl60614_28, IDFG_YCT_v2_Ocl62064_52, IDFG_YCT_v2_Ocl63356_62, IDFG_YCT_v2_Ocl6421_62, IDFG_YCT_v2_Ocl64283_57, IDFG_YCT_v2_Ocl65458_43, IDFG_YCT_v2_Ocl65556_55, IDFG_YCT_v2_Ocl66566_29, IDFG_YCT_v2_Ocl67304_74, IDFG_YCT_v2_Ocl67496_54, IDFG_YCT_v2_Ocl68734_33, IDFG_YCT_v2_Ocl70919_58, IDFG_YCT_v2_Ocl72577_41, IDFG_YCT_v2_Ocl74070_69, IDFG_YCT_v2_Ocl7485_51, IDFG_YCT_v2_Ocl75459_71, IDFG_YCT_v2_Ocl76462_53, IDFG_YCT_v2_Ocl77748_54, IDFG_YCT_v2_Ocl78611_27, IDFG_YCT_v2_Ocl81521_63, IDFG_YCT_v2_Ocl82356_38, IDFG_YCT_v2_Ocl85546_44, IDFG_YCT_v2_Ocl85788_72, IDFG_YCT_v2_Ocl88352_46, IDFG_YCT_v2_Ocl89557_55, IDFG_YCT_v2_Ocl89640_61, IDFG_YCT_v2_Ocl90760_67, IDFG_YCT_v2_Ocl91717_43, IDFG_YCT_v2_Ocl92711_46, IDFG_YCT_v2_Ocl92807_65, IDFG_YCT_v2_Ocl94003_28, IDFG_YCT_v2_Ocl95170_53, IDFG_YCT_v2_Ocl9569_56, IDFG_YCT_v2_Ocl96084_64, Ocl_Chr01_11904953_886586, Ocl_Chr01_1245617_71241, Ocl_Chr01_23856908_2184084, Ocl_Chr01_4263947_240975, Ocl_Chr01_47294360_4844114, Ocl_Chr01_49304682_5059357, Ocl_Chr01_65204432_6778384, Ocl_Chr02_10093148_9366995, Ocl_Chr02_14036883_9774445, Ocl_Chr02_41111233_12592907, Ocl_Chr02_58755685_14645010, Ocl_Chr02_76311531_16531892, Ocl_Chr03_36100207_20027295, Ocl_Chr03_46187757_21021777, Ocl_Chr03_54445579_22006371, Ocl_Chr03_60635751_22668909, Ocl_Chr03_72820636_23938332, Ocl_Chr03_79199746_24455183, Ocl_Chr04_28783021_27639317, Ocl_Chr04_33023764_28088603, Ocl_Chr04_34213356_28219597, Ocl_Chr04_37222382_28545529, Ocl_Chr04_4678489_25012683, Ocl_Chr04_66392742_31700304, Ocl_Chr04_76210634_32621008, Ocl_Chr05_18555027_34575154, Ocl_Chr05_22353480_34975528, Ocl_Chr05_42479354_37280182, Ocl_Chr05_61430172_39368537, Ocl_Chr05_90410498_42131005, Ocl_Chr06_12420435_43129369, Ocl_Chr06_28577594_45059601, Ocl_Chr06_32944093_45564942, Ocl_Chr06_35552627_45862909, Ocl_Chr06_49347522_47229937, Ocl_Chr06_56703154_48022435, Ocl_Chr06_64424167_48985347, Ocl_Chr06_71071691_49699566, Ocl_Chr07_14856486_51326380, Ocl_Chr07_45996063_54719254, Ocl_Chr08_14771460_58941711, Ocl_Chr08_23127002_59937218, Ocl_Chr08_33342876_61016343, Ocl_Chr08_48135323_62498005, Ocl_Chr08_74452534_65317301, Ocl_Chr08_7913543_58216480, Ocl_Chr09_12790880_66825607, Ocl_Chr09_132589_65891972, Ocl_Chr09_14235478_66977774, Ocl_Chr09_21685484_67771579, Ocl_Chr09_39167685_69625176, Ocl_Chr09_41620059_69897246, Ocl_Chr09_48490705_70660510, Ocl_Chr09_49571462_70771512, Ocl_Chr09_55763419_71454240, Ocl_Chr10_22333094_74392487, Ocl_Chr10_32709329_75590026, Ocl_Chr10_35395407_75918540, Ocl_Chr10_53349684_77957625, Ocl_Chr10_54394964_78034790, Ocl_Chr11_18904366_80819009, Ocl_Chr11_21392774_81080130, Ocl_Chr11_35455748_82543634, Ocl_Chr11_39252193_82950896, Ocl_Chr11_54823421_84757779, Ocl_Chr11_6830863_79676696, Ocl_Chr12_30853101_89944910, Ocl_Chr13_34569062_98344570, Ocl_Chr13_35713851_98470555, Ocl_Chr14_20673116_102159336, Ocl_Chr14_62417220_105680075, Ocl_Chr14_80074953_107013785, Ocl_Chr15_19410515_108579195, Ocl_Chr15_3073223_107182434, Ocl_Chr15_39936653_110688475, Ocl_Chr16_14068105_113855290, Ocl_Chr16_19294381_114484561, Ocl_Chr16_21297075_114698152, Ocl_Chr16_5821040_113058367, Ocl_Chr16_63319498_119126301, Ocl_Chr16_65382802_119281196, Ocl_Chr17_20643875_121278476, Ocl_Chr17_26126051_121852980, Ocl_Chr17_36673466_122916670, Ocl_Chr17_46042425_123978835, Ocl_Chr17_58461865_125358635, Ocl_Chr17_7063635_119896425, Ocl_Chr17_9605415_120120519, Ocl_Chr18_26129647_129072958, Ocl_Chr18_34123442_129934400, Ocl_Chr18_35134435_130052437, Ocl_Chr18_36418251_130188244, Ocl_Chr18_38639143_130419135, Ocl_Chr18_44149294_130969040, Ocl_Chr18_45469522_131108058, Ocl_Chr19_22044194_134075134, Ocl_Chr19_28187340_134737135, Ocl_Chr19_32462945_135228332, Ocl_Chr19_33933079_135391160, Ocl_Chr19_44285651_136670064, Ocl_Chr19_58647860_137848971, Ocl_Chr20_19804435_139941027, Ocl_Chr20_31497051_141180625, Ocl_Chr20_36341695_141577269, Ocl_Chr20_38432054_141654148, Ocl_Chr20_6483466_138507676, Ocl_Chr21_45679568_145746560, Ocl_Chr21_50128962_145994829, Ocl_Chr22_8259981_146905324, Ocl_Chr23_14630597_152413591, Ocl_Chr24_16906646_157657003, Ocl_Chr24_3186803_156112670, Ocl_Chr24_39049170_159471644, Ocl_Chr25_11233782_160156892, Ocl_Chr25_24758789_161407476, Ocl_Chr25_28253023_161776112, Ocl_Chr25_44677311_163459109, Ocl_Chr25_60480751_165272387, Ocl_Chr25_69908699_166328532, Ocl_Chr26_24390262_169834458, Ocl_Chr27_16961377_172557845, Ocl_Chr27_22043133_173151182, Ocl_Chr27_26384321_173647407, Ocl_Chr28_15322293_176508570, Ocl_Chr28_30520066_178158297, Ocl_Chr28_36489115_178685376, Ocl_Chr29_14439470_180329363, Ocl_Chr29_17780195_180714945, Ocl_Chr29_20234359_180988711, Ocl_Chr29_23441334_181337945, Ocl_Chr29_38743868_182891893, Ocl_Chr29_9370320_179810673, OclYSD123205_Garza, X2017SDYCUT

52 Reporting Units: bailey_NA, blackrock_lower, blackrock_upper, blacktail_NA, blindbull_NA, cody_bluecrane, boulder_NA, box_NA, cabin_NA, clear_NA, cliff_NA, fall_coburn, cottonwood_grosventre, cottonwood_nps, cowboycabin_NA, crystal_lower, crystal_upper, deadmansbar_NA, dell_NA, ditch_NA, dog_NA, fish_grosventre, fish_NA, spreadnf_flagstaff, flat_NA, fordspring_NA, goosewing_NA, granite_lower, granite_upper, grosventre_lower, hoback_upper, horse_NA, lava_NA, leidy_NA, littlegreys_steer, lowerbarbc_NA, mosquito_NA, northbuffalofork_NA, deadman_greys, pacific_NA, rock_NA, shoal_NA, slate_NA, snakeriversidechannel_NA, spread_southfork, spread_uppermainstem, spring_nps, spring_tss, threechannel_NA, upperbarbc_NA, white_NA, willow_NA

63 Collections: bailey_NA, blackrock_lower, blackrock_upper, blacktail_NA, blindbull_NA, bluecrane_ford, boulder_NA, box_NA, cabin_NA, clear_NA, cliff_NA, coburn_NA, cody_moment, cottonwood_grosventre, cottonwood_nps, cowboycabin_NA, cowboycabin_springchannels, crystal_lower, crystal_upper, deadmansbar_NA, dell_NA, ditch_NA, dog_NA, fall_lower, fall_upper, fish_grosventre, fish_lower, fish_upper, flagstaff_NA, flat_NA, fordspring_NA, goosewing_NA, granite_lower, granite_upper, grosventre_lower, hoback_upper, horse_NA, lava_NA, leidy_NA, littlegreys_lower, littlegreys_upper, lowerbarbc_NA, mosquito_NA, northbuffalofork_NA, deadman_greys, pacific_lower, pacific_upper, rock_NA, shoal_NA, slate_lower, slate_upper, snakeriversidechannel_NA, spread_northfork, spread_southfork, spread_uppermainstem, spring_nps, spring_tss, steer_NA, threechannel_eastfork, threechannel_westfork, upperbarbc_NA, white_NA, willow_NA

6.25% of allelic data identified as missing
Code
self_test_exam <- self_test %>% group_by(indiv) %>% top_n(1, log_likelihood) %>% ungroup()

3.4.2 Inspect z-scores

Z-scores provide an indication of the likelihood that an individual belongs to a reporting group other than that to which it was assigned. Because we are performing self-assignment tests, we would not expect to see any major outliers.

Code
p1 <- ggplot(self_test_exam, aes(x = repunit, y = z_score)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(p1)

Distribution of z-scores by population

3.4.3 Self-assignment likelihood

Do individuals generally assign to their source reporting unit with high assignment likelihood? Expectation is the mean/median should be near 1, anything much lower would be cause for concern.

Code
# Distribution of assignment likelihood
#| fig-cap: "Distribution of indvidual assignment likelihoods."
p1 <- hist(self_test_exam$scaled_likelihood) # probability of assignment to a single reporting group is very high for nearly all individuals

Code
plot(p1, main = paste(paste("Mean = ", round(mean(self_test_exam$scaled_likelihood), digits = 3), "; Median = ", round(median(self_test_exam$scaled_likelihood), digits = 3), sep = "")))

3.4.4 Accuracy and efficiency

Inspect self-assignment accuracy and efficiency by reporting group.

  • Efficiency: the number of individuals known to belong to a reporting group which were recovered from the mixture.
  • Accuracy: the proportion of individuals correctly assigned to a reporting group

First, summarize results by reporting group:

Code
# summarize results by reporting group
ref_pops <- unique(self_test_exam$repunit)
prop_assigned_acc <- rep(NA, length(ref_pops))
prop_assigned_eff <- rep(NA, length(ref_pops))
sampsize <- rep(NA, length(ref_pops))
selfassprop <- tibble(ref_pops, prop_assigned_acc, prop_assigned_eff, sampsize)
for (i in 1:dim(selfassprop)[1]) {
  corass <- self_test_exam[which(self_test_exam$repunit == ref_pops[i]),] # fish known to be from a given reporting unit
  totass <- self_test_exam[which(self_test_exam$inferred_repunit == ref_pops[i]),] # fish assigned to a given reporting unit
  selfassprop$prop_assigned_eff[i] <- length(which(corass$repunit == corass$inferred_repunit)) / nrow(corass) # Efficiency - the number of individuals known to belong to a reporting group which were recovered from the mixture
  selfassprop$prop_assigned_acc[i] <- length(which(totass$repunit == totass$inferred_repunit)) / nrow(totass) # Accuracy - the proportion of individuals correctly assigned to a reporting group
  selfassprop$sampsize[i] <- nrow(corass)
}

# add coordinates for sorting
selfassprop <- selfassprop %>% left_join(gps, by = join_by(ref_pops == repunit)) 

Self-assignment efficiency, weighted mean:

Code
weighted.mean(selfassprop$prop_assigned_eff, selfassprop$sampsize/sum(selfassprop$sampsize), na.rm = T)
[1] 0.8239766

Self-assignment accuracy, weighted mean:

Code
weighted.mean(selfassprop$prop_assigned_acc, selfassprop$sampsize/sum(selfassprop$sampsize), na.rm = T)
[1] 0.8387791

How does reporting group accuracy and efficiency scale with sample size? Generally positively, but relationships with sample size are weak. I.e., even reporting groups with small sample sizes have relatively high accuracy/efficiency and thus low acc/eff are likely not a function of sample size.

Code
par(mar = c(4,4,1,1), mgp = c(2.5,1,0), mfrow = c(1,2))

plot(prop_assigned_eff ~ sampsize, selfassprop, xlab = "Sample Size", ylab = "Self-Assignment Efficiency")
legend("bottomright", legend = paste("Weighted Mean Eff. = ", round(weighted.mean(selfassprop$prop_assigned_eff, selfassprop$sampsize/sum(selfassprop$sampsize), na.rm = T), digits = 3)), bty = "n")

plot(prop_assigned_acc ~ sampsize, selfassprop, xlab = "Sample Size", ylab = "Self-Assignment Accuracy")
legend("bottomright", legend = paste("Weighted Mean Acc. = ", round(weighted.mean(selfassprop$prop_assigned_acc, selfassprop$sampsize/sum(selfassprop$sampsize), na.rm = T), digits = 3)), bty = "n")

View accuracy and efficiency by reporting group:

Code
selfassprop_gather <- selfassprop %>% gather(type, value, prop_assigned_acc:prop_assigned_eff)
selfassprop_gather %>% ggplot() + geom_bar(aes(x = ref_pops, y = value, fill = type), stat = "identity", position = "dodge") + 
  xlab("Reporting Group") + ylab("Accuracy or Efficiency") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), legend.position = "top") + 
  scale_fill_discrete(name = "", labels = c("Accuracy", "Efficiency"))

Reproduce the plot above, but only show accuracy and use numbered reporting groups, by latitude (for supplementary materials).

Code
selfassprop %>% ggplot() + geom_bar(aes(x = reorder(ref_pops, -lat), y = prop_assigned_acc), stat = "identity", position = "dodge", color = "black", fill = "grey") + 
  xlab("Reporting unit") + ylab("Self-assignment accuracy") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), legend.position = "top") + 
  scale_x_discrete(labels = c(1:52))

Write-out self-assignment test results

Code
write_csv(self_test_exam[,c(1:10)], file = "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Self Assignment/self_assign_results_individuals.csv")
write_csv(selfassprop, file = "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Self Assignment/self_assign_results_repgroups.csv")

3.4.5 Confusion matrix

Construct self-assignment confusion matrix: to which reporting units are individuals of known-origin being (mistakenly) assigned?

Code
conf_mat <- matrix(data = NA, nrow = length(unique(self_test_exam$repunit)), ncol = length(unique(self_test_exam$repunit)))
rownames(conf_mat) <- selfassprop$ref_pops
colnames(conf_mat) <- selfassprop$ref_pops
for (i in 1:nrow(conf_mat)){
  for (j in 1:ncol(conf_mat)){
    conf_mat[i,j] <- length(which(self_test_exam$repunit == rownames(conf_mat)[i] & self_test_exam$inferred_repunit == colnames(conf_mat)[j]))/length(which(self_test_exam$repunit == rownames(conf_mat)[i]))
  }
}
# check confusion matrix -- all rows should add to 1
# check <- apply(conf_mat, 1, sum)
# table(check == 1)
reactable(round(conf_mat, digits = 3))

Write out confusion matrix

Code
write.csv(conf_mat, file = "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Self Assignment/self_assign_confusion_matrix.csv", row.names = T, quote = F)

3.5 Simulated mixtures

Evaluate accuracy given our set of SNP markers and the grouping of collections into reporting groups

3.5.1 Run simulations

First, simulate mixtures and assign to reporting groups

Code
# usr_sims <- assess_reference_loo(reference = ref_input, gen_start_col = 5, reps = 500, mixsize = 1000, return_indiv_posteriors = TRUE)
# write_rds(usr_sims, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Simulated Mixtures/sim_mix.rds")
usr_sims <- read_rds("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Simulated Mixtures/sim_mix.rds")

Number of simulations

Code
length(unique(usr_sims$mixing_proportions$iter))
[1] 500

3.5.2 Global results

Mean squared error:

Code
(sum((usr_sims$mixing_proportions$post_mean_pi - usr_sims$mixing_proportions$true_pi)^2) / length(unique(usr_sims$mixing_proportions$iter)))
[1] 0.003963695

Mean residual:

Code
(sum((usr_sims$mixing_proportions$post_mean_pi - usr_sims$mixing_proportions$true_pi)) / length(unique(usr_sims$mixing_proportions$iter))) 
[1] 1.401172e-17

Plot estimated proportional contribution against true contribution for all simulations and populations

Code
plot(post_mean_pi ~ true_pi, usr_sims$mixing_proportions, xlab = "True Mixing Proportion", ylab = "Posterior Mean Mixing Proportion")
abline(a = 0, b = 1, col = "red")
legend("topleft", legend = paste("MSE = ", round((sum((usr_sims$mixing_proportions$post_mean_pi - usr_sims$mixing_proportions$true_pi)^2) / length(unique(usr_sims$mixing_proportions$iter))), digits = 4), sep = ""), bty = "n")

3.5.3 By reporting group

Summarize by reporting group and write to file

Code
tmp0 <- usr_sims$mixing_proportions %>% 
  group_by(repunit) %>% 
  summarize(mse = sum((post_mean_pi - true_pi)^2) / length(unique(iter)),
            mr = sum((post_mean_pi - true_pi)) / length(unique(iter)),
            mr.sd = sd(post_mean_pi - true_pi),
            mr.025 = quantile(post_mean_pi - true_pi, 0.025),
            mr.975 = quantile(post_mean_pi - true_pi, 0.975)) %>%
  ungroup()
tmp0 <- tmp0 %>% left_join(gps) 
write_csv(tmp0, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Simulated Mixtures/sim_mix_repgroup_msemr.csv")

Mean square error by reporting group

Code
tmp0 %>% ggplot() + 
  geom_bar(aes(x = repunit, y = mse), stat = "identity", fill = "grey70", color = "black") +
  xlab("Reporting Group") + ylab("Mean square error") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Mean residual by reporting group

Code
tmp0 %>% ggplot() + 
  geom_bar(aes(x = repunit, y = mr), stat = "identity", fill = "grey70", color = "black") +
  xlab("Reporting Group") + ylab("Mean residual") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Mean residual by reporting group (same as above), but numbered reporting groups with standard deviations

Code
tmp0 %>% ggplot(aes(x = reorder(repunit, -lat), y = mr)) + 
  geom_bar(stat = "identity", fill = "grey70", color = "black") +
  xlab("Reporting unit") + ylab("Mean residual") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_x_discrete(labels = c(1:52)) +
  geom_errorbar(aes(ymin = mr - mr.sd, ymax = mr + mr.sd), width = 0.4)

Plot estimated proportional contribution against true contribution for simulated mixtures for each reporting group:

Code
tmp <- usr_sims$mixing_proportions %>%
  group_by(iter, repunit) %>%
  summarise(true_repprop = sum(true_pi), 
            reprop_posterior_mean = sum(post_mean_pi),
            repu_n = sum(n)) %>%
  mutate(repu_n_prop = repu_n / sum(repu_n))

p <- ggplot(tmp, aes(x = true_repprop, y = reprop_posterior_mean, colour = repunit)) +
  geom_point(alpha = 0.2) +
  geom_abline(intercept = 0, slope = 1) +
  xlab("True Mixing Proportion") + ylab("Posterior Mean Mixing Proportion") + 
  facet_wrap_paginate(~ repunit, nrow = 4, ncol = 4) +
  guides(colour = FALSE)
# pdf("Baseline Data and Testing/Simulated Mixtures/sim_mix_repgroup_scatter.pdf", width = 8, height = 8)
for(i in 1:n_pages(p)) { print(p + facet_wrap_paginate(~ repunit, nrow = 4, ncol = 4, page = i)) }

3.5.4 Z-scores

Can we true the posteriors from individual fish? Plot distributions of z-scores by reporting group, over all simulations

Code
# first summarize by reporting unit
repu_pofzs <- usr_sims$indiv_posteriors %>%
  filter(repunit == simulated_repunit) %>%
  group_by(iter, indiv, simulated_repunit, repunit) %>%  # first aggregate over reporting units
  summarise(repu_PofZ = sum(PofZ)) %>%
  ungroup() %>%
  arrange(repunit, simulated_repunit) %>%
  mutate(simulated_repunit = factor(simulated_repunit, levels = unique(simulated_repunit)))
# also get the number of simulated individuals from each collection
num_simmed <- usr_sims$indiv_posteriors %>%
  group_by(iter, indiv) %>%
  slice(1) %>%
  ungroup() %>%
  count(simulated_repunit)
# now, plot it
ggplot(repu_pofzs, aes(x = simulated_repunit, y = repu_PofZ)) +
  geom_boxplot(aes(colour = repunit)) +
  geom_text(data = num_simmed, mapping = aes(y = 1.025, label = n), angle = 90, hjust = 0, vjust = 0.5, size = 3) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 9, vjust = 0.5)) +
  ylim(c(NA, 1.1)) +
  guides(colour = FALSE)

3.6 100% Simulations

As noted elsewhere, 100% simulations are used less frequently than in the past as assumptions are unrealistic (i.e., assume mixtures only made up of individuals from a single reporting group). However, we perform them here as they provide another diagnostic tool to assess the performance of our baseline.

Run the simulations and write output to file. 20 simulations per reporting group.

Code
# # create list specifying 100% mixtures
# hundy_scenarios <- lapply(unique(all_input$repunit)[!is.na(unique(all_input$repunit))], function(x) tibble(repunit = x, ppn = 1.0))
# names(hundy_scenarios) <- paste("All", unique(all_input$repunit)[!is.na(unique(all_input$repunit))], sep = "-")
# 
# # run the 100% simulations
# repu_hundy_results <- assess_reference_loo(reference = ref_input, gen_start_col = 5, reps = 20, mixsize = 500, alpha_repunit = hundy_scenarios)
# repu_hundy_results <- repu_hundy_results %>% mutate(repunit_true = substr(repunit_scenario, 5, 30))
# repu_hundy_results_sum <- repu_hundy_results %>% filter(repunit_true == repunit) %>% group_by(iter, repunit, repunit_true) %>% summarize(post_ppn = sum(post_mean_pi))
# repu_hundy_results_sum2 <- repu_hundy_results_sum %>% group_by(repunit_true) %>% summarize(post_ppn_mean = mean(post_ppn), post_ppn_sd = sd(post_ppn))
# mean(repu_hundy_results_sum2$post_ppn_mean)
# 
# write_csv(repu_hundy_results, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Hundred Perc Sims/sim_mix_100perc_results_raw.csv")
# write_csv(repu_hundy_results_sum2, "/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Hundred Perc Sims/sim_mix_100perc_results_summarized.csv")

repu_hundy_results_sum2 <- read_csv("/Users/jeffbaldock/Library/CloudStorage/GoogleDrive-jbaldock@uwyo.edu/Shared drives/wyo-coop-baldock/UWyoming/Snake River Cutthroat/Analyses/Snake River GSI Quarto/Baseline Testing/Hundred Perc Sims/sim_mix_100perc_results_summarized.csv")

Plot the results (mean and standard deviation per reporting group). Should be near 100%.

Code
repu_hundy_results_sum2 %>% ggplot() + 
  geom_bar(aes(x = repunit_true, y = post_ppn_mean), stat = "identity", fill = "grey70", color = "black") +
  geom_errorbar(aes(x = repunit_true, ymin = post_ppn_mean - post_ppn_sd, ymax = post_ppn_mean + post_ppn_sd), width = 0.4, color = "black") +
  xlab("Reporting Group") + ylab("100% Simulation Mixture Proportion") + theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))