Read duplicates
all_data %>%
select(dataset,Extraction,duplicates,Taxon) %>%
unique() %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(duplicates), sd(duplicates))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of fraction of duplicated reads")
Mean and standard deviation of fraction of duplicated reads
| Taxon |
REF |
DREX1 |
DREX2 |
| Amphibian |
0.3±0.2 |
0.2±0.2 |
0.2±0.2 |
| Reptile |
0.5±0.4 |
0.3±0.3 |
0.4±0.4 |
| Mammal |
0.4±0.4 |
0.2±0.1 |
0.2±0.2 |
| Bird |
0.8±0.3 |
0.9±0.1 |
0.6±0.4 |
| Control |
1.0±0.0 |
0.9±0.0 |
1.0±0.0 |
all_data %>%
select(dataset,Extraction,duplicates,Taxon, Species) %>%
mutate(duplicates=duplicates*100) %>%
unique() %>%
ggplot(aes(x=Extraction,y=duplicates, color=Species, group=Extraction)) +
scale_y_reverse() +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(. ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Duplication rate (%)",x="Extraction method")

data<-all_data %>%
select(dataset,Sample,Species,Extraction,duplicates,Taxon,catalogue) %>%
filter(Taxon != "Control")
M<-MASS::glmmPQL(duplicates~ Taxon+Extraction,random=~1|Species/Sample,
family="quasibinomial",data=data)
Anova_table<-car::Anova(M,test.statistic="Wald",type="III")%>%
mutate(F=Chisq/Df) # Approximate F value
n <- nrow(M$data) # Total number of observations
df_fixed <- length(fixef(M)) # Number of fixed effect parameters
df_random <- length(ranef(M)) # Number of random effect parameters (or levels)
approx_residual_df <- n - df_fixed - df_random # Approx. residual Df
data.frame(Anova_table,
p_value=pf(Anova_table$F, Anova_table$Df, approx_residual_df, lower.tail = FALSE))%>%
select(F,p_value)
F p_value
(Intercept) 3.382735 0.0705220597
Taxon 6.286319 0.0008325178
Extraction 2.590875 0.0827998947
broom.mixed::tidy(M) %>%
tt()
| effect |
term |
estimate |
std.error |
df |
statistic |
p.value |
| fixed |
(Intercept) |
-0.8529577 |
0.4843817 |
46 |
-1.7609205 |
0.084898541 |
| fixed |
TaxonReptile |
0.7968453 |
0.6107813 |
8 |
1.3046327 |
0.228295375 |
| fixed |
TaxonMammal |
0.2149855 |
0.6241809 |
8 |
0.3444282 |
0.739403356 |
| fixed |
TaxonBird |
2.3421470 |
0.6276001 |
8 |
3.7319100 |
0.005771913 |
| fixed |
ExtractionDREX1 |
-0.4364219 |
0.3522111 |
46 |
-1.2390918 |
0.221596712 |
| fixed |
ExtractionDREX2 |
-0.7829704 |
0.3612021 |
46 |
-2.1676796 |
0.035394104 |
plot_model(M,type="pred",terms=c("Taxon","Extraction"))

Depth of coverage
all_data %>%
select(dataset,Extraction,coverage_depth,Taxon) %>%
unique() %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(coverage_depth), sd(coverage_depth))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of fraction of duplicated reads")
Mean and standard deviation of fraction of duplicated reads
| Taxon |
REF |
DREX1 |
DREX2 |
| Amphibian |
0.0±0.0 |
0.0±0.0 |
0.0±0.0 |
| Reptile |
0.2±0.3 |
0.1±0.1 |
0.1±0.1 |
| Mammal |
0.7±1.2 |
0.3±0.4 |
0.5±1.0 |
| Bird |
0.4±0.8 |
0.6±0.5 |
0.8±0.8 |
| Control |
0.0±0.0 |
0.0±0.0 |
0.0±0.0 |
all_data %>%
select(dataset,Extraction,coverage_depth,Taxon, Species) %>%
unique() %>%
ggplot(aes(x=Extraction,y=coverage_depth, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(. ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Depth of coverage",x="Extraction method")

all_data %>%
select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
unique() %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
plot()

all_data %>%
select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
unique() %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
anova()
Type III Analysis of Variance Table with Satterthwaite's method
Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
Taxon 1802.51 600.84 3 8 6.8155 0.01355 *
Extraction 2.77 1.39 2 46 0.0157 0.98441
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
all_data %>%
select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
unique() %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
plot_model(.,type="pred",terms=c("Taxon","Extraction"))

all_data %>%
select(dataset,Sample,Species,Extraction,coverage_depth,Taxon,catalogue) %>%
unique() %>%
filter(Taxon != "Control") %>%
lmerTest::lmer(rank(coverage_depth)~ Taxon + Extraction + (1 | Species/Sample), data = ., REML = TRUE) %>%
r.squaredGLMM()
R2m R2c
[1,] 0.5280316 0.8163076
Breadth of coverage
all_data %>%
select(dataset,Extraction,coverage_breadth,Taxon) %>%
unique() %>%
group_by(Taxon,Extraction) %>%
summarise(value = sprintf("%.1f±%.1f", mean(coverage_breadth), sd(coverage_breadth))) %>%
pivot_wider(names_from = Extraction, values_from = value) %>%
tt(caption = "Mean and standard deviation of depth of host genome coverage")
Mean and standard deviation of depth of host genome coverage
| Taxon |
REF |
DREX1 |
DREX2 |
| Amphibian |
0.0±0.0 |
0.0±0.0 |
0.0±0.0 |
| Reptile |
4.9±7.5 |
3.0±5.4 |
2.9±3.7 |
| Mammal |
5.7±5.9 |
10.2±16.4 |
15.1±26.4 |
| Bird |
0.6±0.5 |
3.2±4.4 |
8.9±13.9 |
| Control |
0.0±0.0 |
0.0±0.0 |
0.0±0.0 |
all_data %>%
select(dataset,Extraction,coverage_breadth,Taxon,Species) %>%
unique() %>%
ggplot(aes(x=Extraction,y=coverage_breadth, color=Species, group=Extraction)) +
geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") +
geom_jitter() +
scale_color_manual(values=vertebrate_colors) +
facet_grid(. ~ Taxon, scales = "free") +
theme_minimal() +
labs(y="Breadth of coverage (%)",x="Extraction method")

data<-all_data %>%
select(dataset,Extraction,Sample,Species,coverage_breadth,Taxon,catalogue) %>%
## Since amphibians contain only 0s, the small constant value avoid extremely large confidence intervals for that group
mutate(coverage_breadth=(coverage_breadth/100)+0.001)%>%
filter(Taxon != "Control") # %>%filter(catalogue != "All")
M<-MASS::glmmPQL(coverage_breadth~ Taxon+Extraction,random=~1|Species/Sample,
family="quasibinomial",data=data)
Anova_table<-car::Anova(M,test.statistic="Wald",type="III")%>%
mutate(F=Chisq/Df) # Approximate F value
n <- nrow(M$data) # Total number of observations
df_fixed <- length(fixef(M)) # Number of fixed effect parameters
df_random <- length(ranef(M)) # Number of random effect parameters (or levels)
approx_residual_df <- n - df_fixed - df_random # Approx. residual Df
data.frame(Anova_table,
p_value=pf(Anova_table$F, Anova_table$Df, approx_residual_df, lower.tail = FALSE))%>%
select(F,p_value)
F p_value
(Intercept) 32.284463 3.517106e-07
Taxon 2.732762 5.093456e-02
Extraction 9.995537 1.668395e-04
broom.mixed::tidy(M) %>%
tt()
| effect |
term |
estimate |
std.error |
df |
statistic |
p.value |
| fixed |
(Intercept) |
-7.484102 |
1.3757426 |
46 |
-5.440045 |
1.981253e-06 |
| fixed |
TaxonReptile |
2.835852 |
1.5439563 |
8 |
1.836744 |
1.035612e-01 |
| fixed |
TaxonMammal |
4.032235 |
1.5174983 |
8 |
2.657160 |
2.893266e-02 |
| fixed |
TaxonBird |
3.502390 |
1.5192729 |
8 |
2.305306 |
5.005449e-02 |
| fixed |
ExtractionDREX1 |
0.447717 |
0.2802892 |
46 |
1.597340 |
1.170377e-01 |
| fixed |
ExtractionDREX2 |
1.074598 |
0.2620250 |
46 |
4.101129 |
1.658076e-04 |
plot_model(M,type="pred",terms=c("Taxon","Extraction"))

Breadth vs. coverage
all_data_log <- all_data %>%
mutate(coverage_breadth_log=log(coverage_breadth)) %>%
mutate(coverage_depth_log=log(coverage_depth))
lm_eq <- lm(coverage_breadth_log ~ coverage_depth_log, data = all_data_log %>% filter(coverage_depth_log != -Inf ,coverage_breadth_log != -Inf))
coef <- coef(lm_eq)
all_data_log$coverage_breadth_log_pred <- coef[1] + coef[2] * all_data_log$coverage_depth_log
all_data_log %>%
select(dataset,Extraction,coverage_depth_log,coverage_breadth_log,coverage_breadth_log_pred,Taxon,Species,Sample) %>%
unique() %>%
ggplot(aes(x=coverage_depth_log,y=coverage_breadth_log)) +
geom_point(aes(color=Species, shape=Extraction),size=3, alpha=0.9) +
geom_segment(aes(x = coverage_depth_log, y = coverage_breadth_log, xend = coverage_depth_log, yend = coverage_breadth_log_pred, color=Species), alpha=0.9)+
geom_smooth(method = lm, se = FALSE, color="#666666") +
scale_color_manual(values=vertebrate_colors) +
theme_minimal() +
labs(y="Breadth of coverage (%)",x="Depth of coverage")
