4 Genomic data

4.1 Read duplicates

all_data %>%
    select(dataset,Extraction,duplicates,Taxon) %>%
    unique() %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(duplicates), sd(duplicates))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of fraction of duplicated reads")
Mean and standard deviation of fraction of duplicated reads
Taxon REF DREX1 DREX2
Amphibian 0.3±0.2 0.2±0.2 0.2±0.2
Reptile 0.5±0.4 0.3±0.3 0.4±0.4
Mammal 0.4±0.4 0.2±0.1 0.2±0.2
Bird 0.8±0.3 0.9±0.1 0.6±0.4
Control 1.0±0.0 0.9±0.0 1.0±0.0
all_data %>%
    select(dataset,Extraction,duplicates,Taxon, Species) %>%
    mutate(duplicates=duplicates*100) %>%
    unique() %>%
    ggplot(aes(x=Extraction,y=duplicates, color=Species, group=Extraction)) + 
        scale_y_reverse() +
        geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") + 
        geom_jitter() + 
        scale_color_manual(values=vertebrate_colors) +
        facet_grid(. ~ Taxon, scales = "free") +
        theme_minimal() +
        labs(y="Duplication rate (%)",x="Extraction method")

data<-all_data  %>%
    select(dataset,Sample,Species,Extraction,duplicates,Taxon,catalogue) %>%
    filter(Taxon != "Control") 
M<-MASS::glmmPQL(duplicates~ Taxon+Extraction,random=~1|Species/Sample,
               family="quasibinomial",data=data)

Anova_table<-car::Anova(M,test.statistic="Wald",type="III")%>%
    mutate(F=Chisq/Df) # Approximate F value
n <- nrow(M$data)  # Total number of observations
df_fixed <- length(fixef(M))  # Number of fixed effect parameters
df_random <- length(ranef(M))  # Number of random effect parameters (or levels)
approx_residual_df <- n - df_fixed - df_random # Approx. residual Df
data.frame(Anova_table,
           p_value=pf(Anova_table$F, Anova_table$Df, approx_residual_df, lower.tail = FALSE))%>%
  select(F,p_value)
                   F      p_value
(Intercept) 3.382735 0.0705220597
Taxon       6.286319 0.0008325178
Extraction  2.590875 0.0827998947
broom.mixed::tidy(M) %>%
    tt()
effect term estimate std.error df statistic p.value
fixed (Intercept) -0.8529577 0.4843817 46 -1.7609205 0.084898541
fixed TaxonReptile 0.7968453 0.6107813 8 1.3046327 0.228295375
fixed TaxonMammal 0.2149855 0.6241809 8 0.3444282 0.739403356
fixed TaxonBird 2.3421470 0.6276001 8 3.7319100 0.005771913
fixed ExtractionDREX1 -0.4364219 0.3522111 46 -1.2390918 0.221596712
fixed ExtractionDREX2 -0.7829704 0.3612021 46 -2.1676796 0.035394104
plot_model(M,type="pred",terms=c("Taxon","Extraction"))

4.2 Depth of coverage

all_data %>%
    select(dataset,Extraction,coverage_depth,Taxon) %>%
    unique() %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(coverage_depth), sd(coverage_depth))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of fraction of duplicated reads")
Mean and standard deviation of fraction of duplicated reads
Taxon REF DREX1 DREX2
Amphibian 0.0±0.0 0.0±0.0 0.0±0.0
Reptile 0.2±0.3 0.1±0.1 0.1±0.1
Mammal 0.7±1.2 0.3±0.4 0.5±1.0
Bird 0.4±0.8 0.6±0.5 0.8±0.8
Control 0.0±0.0 0.0±0.0 0.0±0.0
all_data %>%
    select(dataset,Extraction,coverage_depth,Taxon, Species) %>%
    unique() %>%
    ggplot(aes(x=Extraction,y=coverage_depth, color=Species, group=Extraction)) + 
        geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") + 
        geom_jitter() + 
        scale_color_manual(values=vertebrate_colors) +
        facet_grid(. ~ Taxon, scales = "free") +
        theme_minimal() +
        labs(y="Depth of coverage",x="Extraction method")

all_data %>%
    select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
    unique() %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
    plot()

all_data %>%
    select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
    unique() %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
    anova()
Type III Analysis of Variance Table with Satterthwaite's method
            Sum Sq Mean Sq NumDF DenDF F value  Pr(>F)  
Taxon      1802.51  600.84     3     8  6.8155 0.01355 *
Extraction    2.77    1.39     2    46  0.0157 0.98441  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
all_data %>%
    select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
    unique() %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
    plot_model(.,type="pred",terms=c("Taxon","Extraction"))

all_data %>%
    select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
    unique() %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
    r.squaredGLMM()
           R2m       R2c
[1,] 0.5280316 0.8163076

4.3 Breadth of coverage

all_data %>%
    select(dataset,Extraction,coverage_breadth,Taxon) %>%
    unique() %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(coverage_breadth), sd(coverage_breadth))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of depth of host genome coverage")
Mean and standard deviation of depth of host genome coverage
Taxon REF DREX1 DREX2
Amphibian 0.0±0.0 0.0±0.0 0.0±0.0
Reptile 4.9±7.5 3.0±5.4 2.9±3.7
Mammal 5.7±5.9 10.2±16.4 15.1±26.4
Bird 0.6±0.5 3.2±4.4 8.9±13.9
Control 0.0±0.0 0.0±0.0 0.0±0.0
all_data %>%
    select(dataset,Extraction,coverage_breadth,Taxon,Species) %>%
    unique() %>%
    ggplot(aes(x=Extraction,y=coverage_breadth, color=Species, group=Extraction)) + 
        geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") + 
        geom_jitter() + 
        scale_color_manual(values=vertebrate_colors) +
        facet_grid(. ~ Taxon, scales = "free") +
        theme_minimal() +
        labs(y="Breadth of coverage (%)",x="Extraction method")

data<-all_data  %>%
    select(dataset,Extraction,Sample,Species,coverage_breadth,Taxon,catalogue) %>%
    ## Since amphibians contain only 0s, the small constant value avoid extremely large confidence intervals for that group
    mutate(coverage_breadth=(coverage_breadth/100)+0.001)%>%
    filter(Taxon != "Control") # %>%filter(catalogue != "All")

M<-MASS::glmmPQL(coverage_breadth~ Taxon+Extraction,random=~1|Species/Sample,
               family="quasibinomial",data=data)

Anova_table<-car::Anova(M,test.statistic="Wald",type="III")%>%
    mutate(F=Chisq/Df) # Approximate F value
n <- nrow(M$data)  # Total number of observations
df_fixed <- length(fixef(M))  # Number of fixed effect parameters
df_random <- length(ranef(M))  # Number of random effect parameters (or levels)
approx_residual_df <- n - df_fixed - df_random # Approx. residual Df
data.frame(Anova_table,
           p_value=pf(Anova_table$F, Anova_table$Df, approx_residual_df, lower.tail = FALSE))%>%
  select(F,p_value)
                    F      p_value
(Intercept) 32.284463 3.517106e-07
Taxon        2.732762 5.093456e-02
Extraction   9.995537 1.668395e-04
broom.mixed::tidy(M) %>%
    tt()
effect term estimate std.error df statistic p.value
fixed (Intercept) -7.484102 1.3757426 46 -5.440045 1.981253e-06
fixed TaxonReptile 2.835852 1.5439563 8 1.836744 1.035612e-01
fixed TaxonMammal 4.032235 1.5174983 8 2.657160 2.893266e-02
fixed TaxonBird 3.502390 1.5192729 8 2.305306 5.005449e-02
fixed ExtractionDREX1 0.447717 0.2802892 46 1.597340 1.170377e-01
fixed ExtractionDREX2 1.074598 0.2620250 46 4.101129 1.658076e-04
plot_model(M,type="pred",terms=c("Taxon","Extraction"))

4.4 Breadth vs. coverage

all_data_log <- all_data %>%
    mutate(coverage_breadth_log=log(coverage_breadth)) %>%
    mutate(coverage_depth_log=log(coverage_depth)) 

lm_eq <- lm(coverage_breadth_log ~ coverage_depth_log, data = all_data_log %>% filter(coverage_depth_log != -Inf ,coverage_breadth_log != -Inf))
coef <- coef(lm_eq)
all_data_log$coverage_breadth_log_pred <- coef[1] + coef[2] * all_data_log$coverage_depth_log


all_data_log %>%
    select(dataset,Extraction,coverage_depth_log,coverage_breadth_log,coverage_breadth_log_pred,Taxon,Species,Sample) %>%
    unique() %>%
    ggplot(aes(x=coverage_depth_log,y=coverage_breadth_log)) + 
        geom_point(aes(color=Species, shape=Extraction),size=3, alpha=0.9) + 
        geom_segment(aes(x = coverage_depth_log, y = coverage_breadth_log, xend = coverage_depth_log, yend = coverage_breadth_log_pred, color=Species), alpha=0.9)+
        geom_smooth(method = lm, se = FALSE, color="#666666") +
        scale_color_manual(values=vertebrate_colors) +
        theme_minimal() +
        labs(y="Breadth of coverage (%)",x="Depth of coverage")