5 Metagenomic data

5.1 Library complexity

Nonpareil estimate of the metagenomic complexity after removing host DNA.

all_data %>%
    select(dataset,Extraction,nonpareil,Taxon) %>%
    unique() %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(nonpareil), sd(nonpareil))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of breadth of host genome coverage")

Mean and standard deviation of breadth of host genome coverage
Taxon	REF	DREX1	DREX2
Amphibian	0.9±0.1	0.8±0.1	0.8±0.1
Reptile	0.9±0.1	0.9±0.0	0.9±0.1
Mammal	0.8±0.2	0.8±0.1	0.7±0.3
Bird	0.9±0.1	1.0±0.0	0.8±0.4
Control	0.0±0.0	0.5±0.6	0.5±0.7

all_data %>%
    select(dataset,Extraction,nonpareil,Taxon,Species) %>%
    unique() %>%
    ggplot(aes(x=Extraction,y=nonpareil, color=Species, group=Extraction)) + 
        geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") + 
        geom_jitter() + 
        scale_color_manual(values=vertebrate_colors) +
        facet_grid(. ~ Taxon, scales = "free") +
        theme_minimal() +
        labs(y="Nonpareil completeness",x="Extraction method")

all_data  %>%
    select(dataset,Extraction,Sample,Species,nonpareil,Taxon,catalogue) %>%
    unique() %>%
    filter(Taxon != "Control") %>%
    #filter(catalogue != "All") %>%
    MASS::glmmPQL(nonpareil~ Taxon+Extraction,random=~1|Species/Sample,
               family="quasibinomial",data=.) %>%
    car::Anova(.,test.statistic="Wald")

Analysis of Deviance Table (Type II tests)

Response: zz
             Chisq Df Pr(>Chisq)    
Taxon       9.1822  3  0.0269646 *  
Extraction 14.2287  2  0.0008133 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

data<-all_data  %>%
    select(dataset,Extraction,Sample,Species,nonpareil,Taxon,catalogue) %>%
    unique() %>%
    filter(Taxon != "Control") #%>% filter(catalogue != "All")
M<-MASS::glmmPQL(nonpareil~ Taxon+Extraction,random=~1|Species/Sample,
               family="quasibinomial",data=data)

Anova_table<-car::Anova(M,test.statistic="Wald",type="III")%>%
    mutate(F=Chisq/Df) # Approximate F value
n <- nrow(M$data)  # Total number of observations
df_fixed <- length(fixef(M))  # Number of fixed effect parameters
df_random <- length(ranef(M))  # Number of random effect parameters (or levels)
approx_residual_df <- n - df_fixed - df_random # Approx. residual Df
data.frame(Anova_table,
           p_value=pf(Anova_table$F, Anova_table$Df, approx_residual_df, lower.tail = FALSE))%>%
  select(F,p_value)

                    F      p_value
(Intercept) 34.129537 1.883942e-07
Taxon        3.060717 3.438361e-02
Extraction   7.114372 1.621943e-03

broom.mixed::tidy(M) %>%
    tt()

effect	term	estimate	std.error	df	statistic	p.value
fixed	(Intercept)	2.0341583	0.3636753	46	5.5933361	1.173059e-06
fixed	TaxonReptile	0.5969719	0.4633445	8	1.2883977	2.336181e-01
fixed	TaxonMammal	-0.5520372	0.4122472	8	-1.3390926	2.173392e-01
fixed	TaxonBird	0.4335768	0.4524258	8	0.9583381	3.659548e-01
fixed	ExtractionDREX1	0.0578808	0.3338911	46	0.1733523	8.631351e-01
fixed	ExtractionDREX2	-0.8572831	0.2948977	46	-2.9070523	5.597154e-03

plot_model(M,type="pred",terms=c("Taxon","Extraction"))

VarCorr(M)

            Variance     StdDev      
Species =   pdLogChol(1)             
(Intercept) 2.270973e-09 4.765473e-05
Sample =    pdLogChol(1)             
(Intercept) 1.978035e-01 4.447510e-01
Residual    1.172704e-01 3.424477e-01

# Within-taxon between-species variance: 0%
4*(2.092129e-09/(2.092129e-09+1.658436e-01))

[1] 5.046029e-08

# Within-species between-sample variance: 4%
4*(1.658436e-01/(2.092129e-09+1.658436e-01))

[1] 4

EHEX obtained lower nonpareil completeness values than REF and DREX. Lowest nonpareil completeness values were ontained for mammals and higest for reptiles.

5.2 Combined community analysis

species="combined"
genus=species

sample_metadata <- read_tsv(paste0("data/metadata.tsv")) %>%
    rename(dataset=Dataset)

read_counts <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_counts.tsv.gz") %>%
  rename(genome = 1)

genome_metadata <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_mag_info.tsv.gz") %>%
  rename(genome = 1, length=mag_size)

genome_coverage <- read_tsv("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134_coverage.tsv.gz") %>%
  rename(genome = 1)

download.file("https://sid.erda.dk/share_redirect/BaMZodj9sA/DMB/DMB0134/DMB0134.tree.gz", "data/DMB0134.tree.gz")
genome_tree <- read_tree("data/DMB0134.tree.gz")

5.2.1 Filter data

#Filter by coverage
min_coverage=0.3
read_counts_filt <- genome_coverage %>%
  mutate(across(where(is.numeric), ~ ifelse(. > min_coverage, 1, 0))) %>%
  mutate(across(-1, ~ . * read_counts[[cur_column()]]))

# Transform  to genome counts (non-filtered)
readlength=150
genome_counts <- read_counts %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

# Transform to genome counts (coverage-filtered)
readlength=150
genome_counts_filt <- read_counts_filt %>%
  mutate(across(where(is.numeric), ~ . / (genome_metadata$length / readlength) ))

5.2.2 Alpha diversity

#Calculate Hill numbers
richness <- genome_counts_filt %>%
            column_to_rownames(var="genome") %>%
            select(where(~!all(. == 0))) %>%
            hilldiv(.,q=0) %>%
            t() %>%
            as.data.frame() %>%
            rename(richness=1) %>%
            rownames_to_column(var="dataset")

neutral <- genome_counts_filt %>%
            column_to_rownames(var="genome") %>%
            select(where(~!all(. == 0))) %>%
            hilldiv(.,q=1) %>%
            t() %>%
            as.data.frame() %>%
            rename(neutral=1) %>%
            rownames_to_column(var="dataset")

phylogenetic <- genome_counts_filt %>%
            column_to_rownames(var="genome") %>%
            select(where(~!all(. == 0))) %>%
            hilldiv(.,q=1,tree=genome_tree) %>%
            t() %>%
            as.data.frame() %>%
            rename(phylogenetic=1) %>%
            rownames_to_column(var="dataset")

# Merge alpha diversities
alpha_diversity <- richness %>%
      full_join(neutral,by=join_by(dataset==dataset)) %>%
      full_join(phylogenetic,by=join_by(dataset==dataset))

# Write alpha diversities
#alpha_diversity %>% write_tsv(paste0("results/alpha_",genus,".tsv"))

# Print alpha diversity
alpha_diversity %>%
  left_join(sample_metadata %>%
    select(dataset, Extraction, Sample), by = join_by(dataset == dataset)) %>%
  group_by(Extraction) %>%
  summarise(richness = mean(richness), neutral = mean(neutral), phylogenetic = mean(phylogenetic)) %>%
  tt()

Extraction	richness	neutral	phylogenetic
DREX1	53.69565	24.45051	4.093443
DREX2	53.45455	25.65059	4.011921
ZYMO	78.76190	35.06459	4.747699

alpha_data <- alpha_diversity %>%
  left_join(all_data,by= join_by(dataset==dataset))

alpha_data %>%
    dplyr::select(dataset,Library,Species,Taxon,Sample,Extraction, richness,neutral,phylogenetic) %>% 
    pivot_longer(!c(dataset,Library,Species,Taxon,Sample,Extraction), names_to = "metric", values_to = "value") %>%
    mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
    mutate(Taxon=factor(Taxon,levels=c("Amphibian","Reptile","Mammal"))) %>%
    unique() %>%
    ggplot(aes(x=Extraction,y=value, color=Species, group=Extraction)) + 
        geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") + 
        geom_jitter() + 
        scale_color_manual(values=vertebrate_colors) +
        facet_grid(metric ~ Taxon, scales = "free") +
        theme_minimal() +
        labs(y="Diversity",x="Extraction method")

5.2.3 Richness

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, richness, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(log(richness) ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  plot()

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, richness, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(log(richness) ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  anova()

Type III Analysis of Variance Table with Satterthwaite's method
            Sum Sq Mean Sq NumDF  DenDF F value  Pr(>F)  
Taxon      11.1732  3.7244     3  8.416  5.8091 0.01921 *
Extraction  0.7084  0.3542     2 39.374  0.5525 0.57992  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, richness, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(log(richness) ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  r.squaredGLMM()

           R2m       R2c
[1,] 0.3901002 0.7584841

## 2% of the variance explained by fixed effects
## 91% of the variance explained by random effects

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, richness, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(log(richness) ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  VarCorr() %>%
  print(comp = "Variance")

 Groups         Name        Variance
 Sample:Species (Intercept) 0.52240 
 Species        (Intercept) 0.45551 
 Residual                   0.64113

# Within-taxon between-species variance: 59%
91 * (0.85583 / (0.85583 + 0.47587))

[1] 58.48204

# Within-species between-sample variance: 33%
91 * (0.47587 / (0.47587 + 0.85583))

[1] 32.51796

REF got slightly lower richness values than DREX and EHEX. Large variability between species within taxa.

5.2.4 Neutral

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, neutral, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(neutral ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  plot()

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, neutral, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(neutral ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  anova()

Type III Analysis of Variance Table with Satterthwaite's method
           Sum Sq Mean Sq NumDF  DenDF F value Pr(>F)  
Taxon      909.42  303.14     3  8.324  2.3875 0.1418  
Extraction 790.15  395.08     2 39.139  3.1116 0.0557 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, neutral, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(neutral ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  r.squaredGLMM()

           R2m       R2c
[1,] 0.2662611 0.7706441

## 2% of the variance explained by fixed effects
## 91% of the variance explained by random effects

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, neutral, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(neutral ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  VarCorr() %>%
  print(comp = "Variance")

 Groups         Name        Variance
 Sample:Species (Intercept) 120.83  
 Species        (Intercept) 158.39  
 Residual                   126.97

# Within-taxon between-species variance: 69%
89 * (286.830 / (286.830 + 81.267))

[1] 69.35093

# Within-species between-sample variance: 20%
89 * (81.267 / (286.830 + 81.267))

[1] 19.64907

No significant differences in neutral diversity between extraction methods and taxonomic groups. Neutral diversity very species specific.

5.2.5 Phylogenetic

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, phylogenetic, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(phylogenetic ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  plot()

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, phylogenetic, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(phylogenetic ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  anova()

Type III Analysis of Variance Table with Satterthwaite's method
            Sum Sq Mean Sq NumDF  DenDF F value  Pr(>F)  
Taxon      11.1581  3.7194     3  8.421  3.6047 0.06232 .
Extraction  1.7646  0.8823     2 39.188  0.8551 0.43302  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, phylogenetic, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(phylogenetic ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  r.squaredGLMM()

           R2m       R2c
[1,] 0.2661818 0.7436036

## 6% of the variance explained by fixed effects
## 90% of the variance explained by random effects

alpha_data %>%
  dplyr::select(dataset, Extraction, Sample, Species, phylogenetic, Taxon, catalogue) %>%
  unique() %>%
  filter(Taxon != "Control") %>%
  lmerTest::lmer(phylogenetic ~ Taxon + Extraction + (1 | Species / Sample), data = ., REML = TRUE) %>%
  VarCorr() %>%
  print(comp = "Variance")

 Groups         Name        Variance
 Sample:Species (Intercept) 1.74086 
 Species        (Intercept) 0.18041 
 Residual                   1.03180

# Within-taxon between-species variance: 43%
90 * (1.37930 / (1.37930 + 1.51968))

[1] 42.82092

# Within-species between-sample variance: 47%
90 * (1.51968 / (1.37930 + 1.51968))

[1] 47.17908

No significant differences in phylogenetic diversity between extraction methods and taxonomic groups.

5.3 Microbial complexity recovery

all_data %>%
    select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon,catalogue) %>%
    mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction))) %>%
    mutate(damr=ifelse(is.na(damr),0,damr)) %>%
    unique() %>% # what do we filter for?
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.3f±%.3f", mean(damr), sd(damr))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of breadth of host genome coverage")

Mean and standard deviation of breadth of host genome coverage
Taxon	REF	DREX1	DREX2
Amphibian	0.755±0.346	0.735±0.337	0.755±0.335
Reptile	0.947±0.070	0.970±0.054	0.977±0.044
Mammal	0.768±0.144	0.889±0.165	0.831±0.154
Bird	0.856±0.197	0.793±0.261	0.724±0.319
Control	1.000±0.000	1.000±0.000	1.000±0.000

all_data %>%
    select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon,Sample,Species) %>%
    mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction))) %>%
    mutate(damr=ifelse(is.na(damr),0,damr)) %>%
    unique() %>%
    ggplot(aes(x=Extraction,y=damr, color=Species, group=Extraction)) + 
        geom_boxplot(outlier.shape = NA, fill="#f4f4f4", color="#8c8c8c") + 
        geom_jitter() + 
        scale_color_manual(values=vertebrate_colors) +
        facet_grid(. ~ Taxon, scales = "free") +
        theme_minimal() +
        labs(y="Domain-adjusted mapping rate",x="Extraction method")

all_data  %>%
    select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon, Sample, Species,catalogue) %>%
    mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction))) %>%
    mutate(damr=ifelse(is.na(damr),0,damr)) %>%
    unique() %>%
    filter(Taxon != "Control") %>%
    MASS::glmmPQL(damr~ Taxon+Extraction,random=~1|Species/Sample,
               family="quasibinomial",data=.) %>%
    car::Anova(.,test.statistic="Wald")

Analysis of Deviance Table (Type II tests)

Response: zz
            Chisq Df Pr(>Chisq)  
Taxon      7.0478  3    0.07039 .
Extraction 0.7968  2    0.67138  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

data<-all_data  %>%
    select(dataset,Extraction,microbial_fraction,MAG_mapping_percentage,Taxon, Sample, Species,catalogue) %>%
    mutate(damr=pmin(1,MAG_mapping_percentage/(microbial_fraction))) %>%
    mutate(damr=ifelse(is.na(damr),0,damr)) %>%
    unique() %>%
    filter(Taxon != "Control") 


M<-MASS::glmmPQL(damr~ Taxon+Extraction,random=~1|Species/Sample,
               family="quasibinomial",data=data)

Anova_table<-car::Anova(M,test.statistic="Wald",type="III")%>%
    mutate(F=Chisq/Df) # Approximate F value
n <- nrow(M$data)  # Total number of observations
df_fixed <- length(fixef(M))  # Number of fixed effect parameters
df_random <- length(ranef(M))  # Number of random effect parameters (or levels)
approx_residual_df <- n - df_fixed - df_random # Approx. residual Df
data.frame(Anova_table,
           p_value=pf(Anova_table$F, Anova_table$Df, approx_residual_df, lower.tail = FALSE))%>%
  select(F,p_value)

                    F    p_value
(Intercept) 5.0023896 0.02880432
Taxon       2.3492555 0.08075295
Extraction  0.3984138 0.67303769

broom.mixed::tidy(M) %>%
    tt()

effect	term	estimate	std.error	df	statistic	p.value
fixed	(Intercept)	1.33363650	0.6227920	46	2.1413836	0.03757338
fixed	TaxonReptile	2.30012902	0.9535923	8	2.4120676	0.04237128
fixed	TaxonMammal	0.40556099	0.8473819	8	0.4786048	0.64502663
fixed	TaxonBird	0.40804767	0.8579913	8	0.4755849	0.64708519
fixed	ExtractionDREX1	0.15284789	0.2883826	46	0.5300177	0.59864887
fixed	ExtractionDREX2	-0.08950725	0.2795925	46	-0.3201346	0.75031595

plot_model(M,type="pred",terms=c("Taxon","Extraction"))

## 27% of the variance explained by fixed effects
## 27% of the variance explained by random effects

VarCorr(M)

            Variance     StdDev      
Species =   pdLogChol(1)             
(Intercept) 1.073914e-07 0.0003277063
Sample =    pdLogChol(1)             
(Intercept) 1.679619e+00 1.2960011568
Residual    9.176235e-02 0.3029230022

# Within-taxon between-species variance: 0%
27*(2.545883e-08/(2.545883e-08+1.911908e+00))

[1] 3.595301e-07

# Within-species between-sample variance: 27%
27*(1.911908e+00/(2.545883e-08+1.911908e+00))

[1] 27

All three extraction methods yield similar DAMRs.

5.3.1 Community barplot

# Retrieve EHI taxonomy colors
phylum_colors <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
  right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    select(phylum, colors) %>%
    unique() %>%
    arrange(phylum) %>%
    select(colors) %>%
    pull()

# Stacked barplot
genome_counts %>%
    mutate_at(vars(-genome), funs(./sum(., na.rm = TRUE)))  %>% #apply TSS nornalisation
    pivot_longer(-genome, names_to = "dataset", values_to = "count") %>%
    left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append taxonomy
    left_join(., sample_metadata, by = join_by(dataset == dataset)) %>%
    mutate(Taxon=factor(Taxon,levels=c("Amphibian",
                                       "Reptile",
                                       "Mammal",
                                       "Bird",
                                       "Control"))) %>%
    mutate(Species=factor(Species,levels=c("Calotriton asper",
                                           "Lissotriton helveticus",
                                           "Salamandra atra",
                                           "Chalcides striatus",
                                           "Natrix astreptophora",
                                           "Podarcis muralis",
                                           "Plecotus auritus",
                                           "Sciurus carolinensis",
                                           "Trichosurus vulpecula",
                                           "Geospizopsis unicolor",
                                           "Perisoreus infaustus",
                                           "Zonotrichia capensis",
                                           "Extraction control",
                                           "Library control"))) %>%
    filter(Taxon != "Control") %>%
    ggplot(., aes(x=dataset,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
          geom_bar(stat="identity", colour="white", linewidth=0.1)+ #plot stacked bars with white borders
          scale_fill_manual(values=phylum_colors) +
          labs(y = "Relative abundance") +
          facet_nested(. ~ Taxon + Species + Sample + Extraction, scales="free_x") +
          guides(fill = guide_legend(ncol = 3)) +
          theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          axis.title.x = element_blank(),
                panel.background = element_blank(),
                panel.border = element_blank(),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
                legend.title=element_blank(),
                panel.spacing = unit(0, "lines"))

#ggsave(paste0("figures/barplot_",genus,".pdf"))

genome_counts_filt_variance <- genome_counts_filt %>%
            column_to_rownames(var="genome") %>% 
            select(all_of(sample_metadata %>%
                            filter(!Taxon %in% c("Birds", "Control")) %>% # exclude birds
                            pull(dataset))) %>% 
            select(where(~!all(. == 0)))

# Perform betadisper analysis
richness_betadisper_result <- genome_counts_filt_variance %>%
  hillpair(.,q=0, metric="C") %>%
  as.dist() %>% 
  betadisper(., sample_metadata %>% 
                      filter(dataset %in% colnames(genome_counts_filt_variance)) %>%
                      arrange(match(dataset,colnames(genome_counts_filt_variance))) %>% pull(Extraction))
# Check if there are significant differences in dispersion
permutest(richness_betadisper_result,pairwise = TRUE)  # Tests homogeneity of dispersion and pairwise comparisons between groups using permutations.


Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
          Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups     2 0.009035 0.0045175 1.1444    999   0.33
Residuals 60 0.236859 0.0039477                     

Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
        DREX1   DREX2  ZYMO
DREX1         0.57300 0.203
DREX2 0.55115         0.341
ZYMO  0.20658 0.35323

TukeyHSD(richness_betadisper_result) # test significance between each group

  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = distances ~ group, data = df)

$group
                    diff         lwr        upr     p adj
DREX2-DREX1 -0.007193598 -0.05379165 0.03940445 0.9270330
ZYMO-DREX1  -0.028224995 -0.07482304 0.01837305 0.3194398
ZYMO-DREX2  -0.021031398 -0.06762945 0.02556665 0.5272157

richness <- genome_counts_filt_variance %>%
            hillpair(.,q=0, metric="C") %>%
            as.dist() %>% # nrow(richness) 
            adonis2(formula = . ~ Extraction+Taxon+Species+Sample, 
                    data = sample_metadata %>% 
                      filter(dataset %in% colnames(genome_counts_filt_variance)) %>%
                      arrange(match(dataset,colnames(genome_counts_filt_variance))), 
                    permutations = 999, 
                    by="terms") %>%
            broom::tidy()
richness

# A tibble: 6 × 6
  term          df SumOfSqs     R2 statistic p.value
  <chr>      <dbl>    <dbl>  <dbl>     <dbl>   <dbl>
1 Extraction     2    0.575 0.0208      3.62   0.001
2 Taxon          3    6.62  0.239      27.7    0.001
3 Species        8   13.2   0.475      20.7    0.001
4 Sample        11    4.31  0.156       4.93   0.001
5 Residual      38    3.02  0.109      NA     NA    
6 Total         62   27.7   1          NA     NA

# Perform betadisper analysis
neutral_betadisper_result <- genome_counts_filt_variance %>%
  hillpair(.,q=1, metric="C") %>%
  as.dist() %>% 
  betadisper(., sample_metadata %>% 
                      filter(dataset %in% colnames(genome_counts_filt_variance)) %>%
                      arrange(match(dataset,colnames(genome_counts_filt_variance))) %>% pull(Extraction))
# Check if there are significant differences in dispersion
permutest(neutral_betadisper_result,pairwise=TRUE)  # Tests homogeneity of dispersion and pairwise comparisons between groups using permutations.


Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
          Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups     2 0.009171 0.0045854 1.3738    999   0.26
Residuals 60 0.200261 0.0033377                     

Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
        DREX1   DREX2  ZYMO
DREX1         0.42700 0.171
DREX2 0.42982         0.343
ZYMO  0.16936 0.32804

neutral <- genome_counts_filt_variance %>%
            hillpair(.,q=1, metric="C") %>%
            as.dist() %>%
            adonis2(formula = . ~ Extraction+Taxon+Species+Sample, 
                    data = sample_metadata %>% 
                      filter(dataset %in% colnames(genome_counts_filt_variance)) %>%
                      arrange(match(dataset,colnames(genome_counts_filt_variance))), 
                    permutations = 999, 
                    by="terms") %>%
            broom::tidy()
neutral

# A tibble: 6 × 6
  term          df SumOfSqs     R2 statistic p.value
  <chr>      <dbl>    <dbl>  <dbl>     <dbl>   <dbl>
1 Extraction     2    0.447 0.0160      5.29   0.001
2 Taxon          3    6.83  0.244      53.9    0.001
3 Species        8   13.4   0.479      39.7    0.001
4 Sample        11    5.68  0.203      12.2    0.001
5 Residual      38    1.61  0.0574     NA     NA    
6 Total         62   28.0   1          NA     NA

# Perform betadisper analysis
phylogenetic_betadisper_result <- genome_counts_filt_variance %>%
  hillpair(.,q=1, metric="C") %>%
  as.dist() %>% 
  betadisper(., sample_metadata %>% 
                      filter(dataset %in% colnames(genome_counts_filt_variance)) %>%
                      arrange(match(dataset,colnames(genome_counts_filt_variance))) %>% pull(Extraction))
# Check if there are significant differences in dispersion
permutest(phylogenetic_betadisper_result,pairwise = TRUE)  # Tests homogeneity of dispersion and pairwise comparisons between groups using permutations.


Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
          Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups     2 0.009171 0.0045854 1.3738    999  0.287
Residuals 60 0.200261 0.0033377                     

Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
        DREX1   DREX2  ZYMO
DREX1         0.46000 0.183
DREX2 0.42982         0.322
ZYMO  0.16936 0.32804

phylogenetic <- genome_counts_filt_variance %>%
            hillpair(.,q=1, tree=genome_tree, metric="C") %>%
            as.dist() %>%
            adonis2(formula = . ~ Extraction+Taxon+Species+Sample, 
                    data = sample_metadata %>% 
                      filter(dataset %in% colnames(genome_counts_filt_variance)) %>%
                      arrange(match(dataset,colnames(genome_counts_filt_variance))), 
                    permutations = 999, 
                    by="terms") %>%
            broom::tidy()
phylogenetic

# A tibble: 6 × 6
  term          df SumOfSqs     R2 statistic p.value
  <chr>      <dbl>    <dbl>  <dbl>     <dbl>   <dbl>
1 Extraction     2    0.259 0.0259      4.96   0.001
2 Taxon          3    3.63  0.364      46.4    0.001
3 Species        8    3.67  0.368      17.6    0.001
4 Sample        11    1.41  0.142       4.93   0.001
5 Residual      38    0.991 0.0994     NA     NA    
6 Total         62    9.97  1          NA     NA

data.frame(
  Term = factor(rep(c("Extraction", "Taxon", "Species","Sample","Residual"), times = 3),levels = c("Extraction", "Taxon", "Species","Sample","Residual")),
  Metric = factor(rep(c("Richness", "Neutral", "Phylogenetic"), each = 5),levels = c("Richness", "Neutral", "Phylogenetic")),
  Value = c(richness$R2[-6],neutral$R2[-6],phylogenetic$R2[-6])
)%>%
  ggplot(., aes(x = Metric, y = Value, fill = Term)) +
  geom_bar(stat = "identity", position = "stack") +
  #scale_fill_brewer(palette = "Set1") +  # Choose a color palette
  scale_fill_manual(values = c("#47bfb6", "#4a7015", "#5e1717", "#9e6b24", "#4a4a4a")) +
  labs(
    x = "Beta diversity metric",
    y = "R-square",
    fill = "Term"
  ) +
  theme_minimal()

genome_counts_NMDS <- genome_counts_filt_variance %>%
            #column_to_rownames(var="genome") %>%
            select(where(~!all(. == 0))) %>%
            hillpair(.,q=1, metric="C", out="dist") %>%
            metaMDS(.,trymax = 999, k=2, trace=0) %>%
            vegan::scores() %>%
            as_tibble(., rownames = "dataset") %>%
            left_join(sample_metadata, by = join_by(dataset == dataset)) %>%
            filter(Taxon != "Control") %>%
            group_by(Sample) %>%
            mutate(sample_x=mean(NMDS1), sample_y=mean(NMDS2))

genome_counts_NMDS %>%
            ggplot(., aes(x=NMDS1,y=NMDS2, color=Species, shape=Extraction)) +
                scale_color_manual(values=vertebrate_colors) +
                geom_point(size=3, alpha=0.8) +
                geom_segment(aes(x=sample_x, y=sample_y, xend=NMDS1, yend=NMDS2), alpha=0.2) +
                theme_classic() +
                theme(legend.position="right", legend.box="vertical") +
                guides(color=guide_legend(title="Species"))