4 Laboratory sample processing

4.1 DNA extraction

4.1.1 General statistics

read_tsv("data/extraction.tsv") %>%
   summarise(
    max= max(extraction_total, na.rm = TRUE),
    min= min(extraction_total, na.rm = TRUE),
    mean= mean(extraction_total, na.rm = TRUE),
    sd = sd(extraction_total, na.rm = TRUE),
    median=median(extraction_total, na.rm = TRUE),
    IQR=IQR(extraction_total, na.rm = TRUE)
  ) %>%
  tt()
tinytable_vxly1u1b0ubs114o5h56
max min mean sd median IQR
7110 0 365.683 677.4012 100.35 425.7

4.1.1.1 Data distribution

4.1.2 Sample types

left_join(read_tsv("data/extraction.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab"))  %>%
  ggplot(aes(x=sample_type, y= extraction_total, fill=sample_type, color=sample_type)) + 
    ylim(0, 2000) +
    geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
    stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
    scale_color_manual(values = c("#bdca50", "#AA3377")) +   
    scale_fill_manual(values = c("#bdca5050", "#AA337750")) +
    labs(y="Density",x="DNA yield", fill="Sample type", color="Sample type") +
    theme_classic()

ggsave("figures/extraction_type.pdf",width=6, height=4, units="in")

left_join(read_tsv("data/extraction.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  group_by(sample_type)  %>%
  summarise(
    mean= mean(extraction_total, na.rm = TRUE),
    sd = sd(extraction_total, na.rm = TRUE),
    median=median(extraction_total, na.rm = TRUE),
    IQR=IQR(extraction_total, na.rm = TRUE)) %>%
  tt()
tinytable_v324drtq5gq4g42n2qdv
sample_type mean sd median IQR
Anal/cloacal swab 170.2285 362.7389 25.335 155.7743
Faecal 316.6666 423.6434 129.375 430.8075

4.1.3 Taxonomy

left_join(read_tsv("data/extraction.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab"))  %>%
  mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
  ggplot(aes(y=extraction_total,x=tax_group,color=tax_group,fill=tax_group)) + 
    ylim(0, 2000) +
    geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
    stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
    scale_color_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
    scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
    labs(y="DNA yield",x="Taxonomic group", color="Sample type") +
    theme_classic()

ggsave("figures/extraction_taxa.pdf",width=9, height=4, units="in")

left_join(read_tsv("data/extraction.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  group_by(tax_group)  %>%
  summarise(
    mean= mean(extraction_total, na.rm = TRUE),
    sd = sd(extraction_total, na.rm = TRUE),
    median=median(extraction_total, na.rm = TRUE),
    IQR=IQR(extraction_total, na.rm = TRUE)) %>%
  tt()
tinytable_fuclxksxf6xzbf7qbtkt
tax_group mean sd median IQR
Amphibians 233.35078 358.90219 128.250 137.2500
Bats 59.60574 92.00357 30.735 63.4950
Birds 74.43550 262.34517 7.380 27.3555
Mammals 463.93202 471.41487 321.975 563.6250
Reptiles 394.50619 437.86229 249.750 541.3500

4.1.3.1 Comparison

left_join(read_tsv("data/extraction.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  dplyr::select(extraction_total,sample_type,tax_group) %>%
  mutate(sample_type = factor(sample_type),tax_group=factor(tax_group)) %>%
  lm(rank(extraction_total) ~ tax_group+sample_type,data=.) %>%
  Anova(.,type="III")%>%
  tidy()%>%
  tt()
tinytable_jjtals3j5fpqdaupzuqa
term sumsq df statistic p.value
(Intercept) 182499974 1 556.65758 2.404646e-111
tax_group 413589974 4 315.38086 4.198853e-220
sample_type 8154841 1 24.87373 6.538535e-07
Residuals 824213930 2514 NA NA
left_join(read_tsv("data/extraction.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  dplyr::select(extraction_total,sample_type,tax_group) %>%
  mutate(sample_type = factor(sample_type),tax_group=factor(tax_group)) %>%
  lm(rank(extraction_total) ~ tax_group+sample_type,data=.) %>%
  glht(.,linfct = mcp(tax_group = "Tukey"))%>%
  summary()%>%
  tidy()%>%
  tt()
tinytable_oqqhi7tm74sin26q3f7o
term contrast null.value estimate std.error statistic adj.p.value
tax_group Bats - Amphibians 0 -559.4931 52.02136 -10.755065 0.000000e+00
tax_group Birds - Amphibians 0 -725.2098 53.12799 -13.650239 0.000000e+00
tax_group Mammals - Amphibians 0 311.3585 48.13741 6.468118 1.462295e-09
tax_group Reptiles - Amphibians 0 107.1624 49.38816 2.169799 1.848433e-01
tax_group Birds - Bats 0 -165.7167 41.15805 -4.026349 5.689658e-04
tax_group Mammals - Bats 0 870.8516 34.96158 24.908815 0.000000e+00
tax_group Reptiles - Bats 0 666.6555 36.79145 18.119849 0.000000e+00
tax_group Mammals - Birds 0 1036.5683 34.63845 29.925364 0.000000e+00
tax_group Reptiles - Birds 0 832.3722 38.06934 21.864631 0.000000e+00
tax_group Reptiles - Mammals 0 -204.1961 30.99605 -6.587811 6.917859e-10

4.2 Sequencing library preparation

4.2.1 Overall

left_join(read_tsv("data/library.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  filter(library_PCR_cycles_required > 0) %>%
   summarise(
    max= max(library_input_dna, na.rm = TRUE),
    min= min(library_input_dna, na.rm = TRUE),
    mean= mean(library_input_dna, na.rm = TRUE),
    sd = sd(library_input_dna, na.rm = TRUE),
    median=median(library_input_dna, na.rm = TRUE),
    IQR=IQR(library_input_dna, na.rm = TRUE)) %>%
  tt()
tinytable_zll5helqranro2i6rlxk
max min mean sd median IQR
552 0 89.34565 84.03881 57.6 180.006
left_join(read_tsv("data/library.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  filter(library_PCR_cycles_required > 0) %>%
   summarise(
    max= max(library_PCR_cycles_required, na.rm = TRUE),
    min= min(library_PCR_cycles_required, na.rm = TRUE),
    mean= mean(library_PCR_cycles_required, na.rm = TRUE),
    sd = sd(library_PCR_cycles_required, na.rm = TRUE),
    median=median(library_PCR_cycles_required, na.rm = TRUE),
    IQR=IQR(library_PCR_cycles_required, na.rm = TRUE)) %>%
  tt()
tinytable_9iy7qey6deztx7l67aec
max min mean sd median IQR
33 2 9.150046 4.390537 8 6
left_join(read_tsv("data/library.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  filter(library_PCR_cycles_required > 0) %>%
  filter(library_input_dna > 0) %>%
  mutate(library_PCR_cycles_required=round(library_PCR_cycles_required))%>%
  glm(library_PCR_cycles_required ~ tax_group*log(library_input_dna)+sample_type*log(library_input_dna), data = .,family=poisson())  %>%
  Anova(.,test.statistic="Wald")
Analysis of Deviance Table (Type II tests)

Response: library_PCR_cycles_required
                                   Df    Chisq Pr(>Chisq)    
tax_group                           4  83.0304  < 2.2e-16 ***
log(library_input_dna)              1 782.5648  < 2.2e-16 ***
sample_type                         1  27.0760  1.956e-07 ***
tax_group:log(library_input_dna)    4  69.2767  3.226e-14 ***
log(library_input_dna):sample_type  1   4.2027    0.04036 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
left_join(read_tsv("data/library.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  filter(library_PCR_cycles_required > 0) %>%
  filter(library_input_dna > 0) %>%
  filter(library_input_dna < 200) %>%
  mutate(library_PCR_cycles_required=round(library_PCR_cycles_required))%>%
  glm(library_PCR_cycles_required ~ tax_group*log(library_input_dna)+sample_type*log(library_input_dna), data = .,family=poisson())  %>%
  plot_model(.,type="pred",terms=c("library_input_dna[1:200]","sample_type"),show.data = TRUE,dot.size = 1,line.size = 1)+
  scale_color_manual(values = c("#bdca50", "#AA3377")) +
  scale_fill_manual(values = c("#bdca5050", "#AA337750")) +
  labs(y="Required number of cycles",x="Amount of inputted DNA (ng)", color="Sample type") +
  theme_classic()

ggsave("figures/cycles_type.pdf",width=6, height=3, units="in")

left_join(read_tsv("data/library.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>% 
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  filter(library_PCR_cycles_required > 0) %>%
  filter(library_input_dna > 0) %>%
  filter(library_input_dna < 200) %>%
  mutate(library_PCR_cycles_required=round(library_PCR_cycles_required))%>%
  mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
  glm(library_PCR_cycles_required ~ tax_group*log(library_input_dna)+sample_type*log(library_input_dna), data = .,family=poisson())  %>%
  plot_model(.,type="pred",terms=c("library_input_dna [1:200]","tax_group"),show.data = TRUE,dot.size = 1,line.size = 1)+
  scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        labs(y="Required number of cycles",x="Amount of inputted DNA (ng)", color="Sample type", fill="Sample type") +
  labs(y="Required number of cycles",x="Amount of inputted DNA (ng)", color="Taxonomic group") +
  theme_classic()

ggsave("figures/cycles_taxa.pdf",width=6, height=3, units="in")

4.3 Data quality

left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
  left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
  left_join(read_tsv("data/index.tsv"),by="index_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
  mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
  #Plot map  EE6677 < bats
  ggplot(aes(y=lowqual_perc_bases, x=index_PCR_cycles_given, colour=sample_type, fill=sample_type, group=sample_type)) +
    geom_jitter(alpha=0.3) +
    stat_smooth(method = "gam", formula = y ~ s(x, bs = "ps",k=4),se=FALSE, geom = "smooth", alpha=0.2) +
    scale_color_manual(values = c("#bdca50", "#AA3377")) +   
    scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
    scale_x_log10() +   
    theme_classic() +
    theme(legend.position = "bottom")

left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
  left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
  left_join(read_tsv("data/index.tsv"),by="index_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
  mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
  mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
  ggplot(aes(y=lowqual_perc_bases, x=index_PCR_cycles_given, colour=tax_group, fill=tax_group, group=tax_group)) +
    geom_jitter(alpha=0.3) +
    stat_smooth(method = "gam", formula = y ~ s(x, bs = "ps",k=4),se=FALSE, geom = "smooth", alpha=0.2) +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
    scale_x_log10() +   
    theme_classic() +
    theme(legend.position = "bottom")

4.4 Data duplication

left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
  left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
  left_join(read_tsv("data/index.tsv"),by="index_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
  mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
  #Plot map  EE6677 < bats
  ggplot(aes(y=host_duplicate_fraction, x=index_PCR_cycles_given, colour=sample_type, fill=sample_type, group=sample_type)) +
    geom_jitter(alpha=0.3) +
    stat_smooth(method = "gam", formula = y ~ s(x, bs = "ps",k=4),se=FALSE, geom = "smooth", alpha=0.2) +
    scale_color_manual(values = c("#bdca50", "#AA3377")) +   
    scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
    scale_x_log10() +   
    theme_classic() +
    theme(legend.position = "bottom")

left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
  left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
  left_join(read_tsv("data/index.tsv"),by="index_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
  mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
  mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
  ggplot(aes(y=host_duplicate_fraction, x=index_PCR_cycles_given, colour=tax_group, fill=tax_group, group=tax_group)) +
    geom_jitter(alpha=0.3) +
    stat_smooth(method = "gam", formula = y ~ s(x, bs = "ps",k=4),se=FALSE, geom = "smooth", alpha=0.2) +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
    scale_x_log10() +   
    theme_classic() +
    theme(legend.position = "bottom")