Chapter 7 Diversity analyses

This workflow performs diversity analyses of the count data generated through the EHI pipeline. Note the workflow relies on previous scripts, so make sure you go through the general pipeline before starting working on this document.

7.1 Data preparation

In order to avoid issues with diversity computation is recommendable to remove samples and MAGs without count data.

#Get list of present MAGs
present_MAGs <- count_table_cov_size %>%
        filter(rowSums(.[, -1]) != 0) %>%
        rownames()

#Remove samples with all zeros (no data after filtering)
count_table_cov_size <- count_table_cov_size %>%
  select_if(~!all(. == 0))

#Align KEGG annotations with present MAGs and remove all-zero and all-one traits
present_MAGs <- present_MAGs[present_MAGs %in% rownames(kegg_table)]
kegg_table_filt <- kegg_table[present_MAGs,] %>%
            select_if(~!all(. == 0)) %>%  #remove all-zero modules
            select_if(~!all(. == 1)) #remove all-one modules

#Filter count table to only contain present MAGs after KEGG filtering
count_table_cov_size_filt <- count_table_cov_size[present_MAGs,]

7.2 Alpha diversity

Alpha diversity metrics

q0n <- hilldiv(count_table_cov_size,q=0) %>% c()
q1n <- hilldiv(count_table_cov_size,q=1) %>% c()
q1p <- hilldiv(count_table_cov_size,q=1,tree=tree) %>% c()
dist <- traits2dist(kegg_table_filt, method="gower")
q1f <- hilldiv(count_table_cov_size_filt,q=1,dist=dist) %>% c()

# Merge all metrics
alpha_div <- cbind(sample=colnames(count_table_cov_size),richness=q0n,neutral=round(q1n,3),phylo=round(q1p,3),func=round(q1f,3)) %>%
  as.data.frame()
columns <- c("richness","neutral","phylo","func","mapped","total")

# Add amount of sequencing data to the table
alpha_div <- alpha_div %>%
  left_join(sequence_fractions, by = join_by(sample == sample)) %>% #add sequencing depth information
  mutate(mapped=round(mags_bases/1000000000,3)) %>% #modify depth to million reads
  mutate(total=round((mags_bases+unmapped_bases+host_bases+lowqual_bases)/1000000000,3)) %>%
  select(sample,richness,neutral,phylo,func,mapped,total) %>%
  mutate(across(-1, as.numeric))

Alpha diversity plots

alpha_div %>%
        pivot_longer(-sample, names_to = "data", values_to = "value") %>%
        mutate(data = factor(data, levels = columns))   %>%
        ggplot(aes(x=value, y=sample)) +
            geom_bar(stat='identity', fill="#6c9ebc") +
            facet_wrap(~data,  scales="free_x", ncol=6) +
            theme_classic() +
            theme(
                strip.background = element_blank(),
                panel.grid.minor.x = element_line( size=.1, color="grey" ),
                axis.title.x = element_blank(),
                axis.title.y = element_blank(),
                axis.text.x = element_text(angle = 45, hjust = 1)
            )
Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

You can also generate an HTML table using knitr.

kable(alpha_div)
sample richness neutral phylo func mapped total
EHI00696 190 101.237 6.692 1.518 3.430 5.868
EHI00687 75 51.579 5.086 1.523 0.783 1.297
EHI00757 123 64.415 7.042 1.626 2.567 4.455
EHI00686 82 17.581 3.732 1.373 3.094 4.307
EHI00693 159 72.791 5.608 1.450 3.975 5.656
EHI00697 54 41.198 6.811 1.476 0.386 1.067
EHI00688 114 52.637 4.804 1.485 2.812 3.850
EHI00730 215 113.479 10.131 1.521 2.565 5.078
EHI00692 162 60.243 6.321 1.485 3.244 5.099
EHI00676 159 82.850 6.521 1.469 2.401 3.758
EHI00682 85 46.715 9.146 1.568 0.780 1.596
EHI00679 137 69.732 7.719 1.500 1.645 3.306
EHI00690 95 44.051 4.643 1.420 1.424 2.459
EHI00683 125 81.839 7.024 1.481 1.189 2.022
EHI00733 147 60.117 4.491 1.427 3.688 5.706
EHI00685 89 40.760 4.801 1.491 4.730 6.188
EHI00752 70 41.494 8.117 1.589 3.176 8.595
EHI00681 69 28.127 4.050 1.440 1.093 1.528
EHI00700 144 88.972 6.817 1.473 1.660 2.480
EHI00680 132 65.208 5.530 1.529 2.736 3.559
EHI00751 113 64.296 6.796 1.528 2.999 4.552
EHI00742 113 56.852 5.937 1.505 3.071 5.697
EHI00675 128 65.964 8.662 1.495 1.592 3.102
EHI00732 117 70.557 4.418 1.458 1.360 3.986
EHI00684 120 59.392 5.552 1.436 1.498 3.132
EHI00694 121 46.891 5.099 1.448 3.452 5.636
EHI00743 165 78.383 5.206 1.467 4.395 6.805
EHI00731 144 89.203 5.833 1.537 1.648 2.853
EHI00698 97 20.448 3.936 1.366 1.399 2.199
EHI00699 142 68.270 8.327 1.452 1.931 2.983
EHI00674 60 32.495 3.939 1.410 0.669 1.039
EHI00753 89 55.487 8.387 1.551 1.742 3.324
EHI00695 179 91.539 5.833 1.437 4.036 5.528
EHI00678 135 71.700 6.178 1.466 2.403 3.815
EHI00689 62 20.056 3.682 1.374 1.551 2.029
EHI00677 121 50.998 4.342 1.485 6.818 8.344
EHI00691 81 35.247 5.524 1.441 2.738 3.846

Alpha diversity comparisons

Alpha diversities can be compared across any categorical features that group analysed samples (e.g., localities, sampling seasons, sex), or continuous variables associated with the host animals.

Let’s first create a nice colour palette for the localities

alpha_colors <- c("#e5bd5b","#6b7398","#76b183","#d57d2c","#2a2d26","#f9d4cc","#3c634e","#ea68c3")

Let’s also identify the number of comparing groups, so that the colour palette can be subsetted properly when plotting the figures.

group_n <- alpha_div %>% select(sample,neutral) %>%
        left_join(sample_table, by = join_by(sample == sample)) %>%
        mutate(location=paste0(round(longitude,2),"_",round(latitude,2))) %>%
        select(location) %>% pull() %>% unique() %>% length()

Neutral diversity

alpha_div %>%
            select(sample,neutral) %>%
            pivot_longer(-sample, names_to = "data", values_to = "value") %>%
            mutate(data = factor(data, levels = columns))   %>%
            left_join(sample_table, by = join_by(sample == sample)) %>%
            mutate(location=paste0(round(longitude,2),"_",round(latitude,2))) %>%
            ggboxplot(., x = "location", y = "value", color = "location", fill="location") +
                    scale_color_manual(values=alpha_colors[c(1:group_n)]) +
                    scale_fill_manual(values=paste0(alpha_colors[c(1:group_n)],"50")) +
                    stat_compare_means() +
                    theme_classic() +
                    labs(y = "Neutral Hill numbers") +
                    theme(
                        legend.position = "top",
                        legend.box = "horizontal",
                        axis.title.x = element_blank(),
                        axis.text.x = element_blank()) +
                    guides(color=guide_legend(title="Location"), fill="none")

Phylogenetic diversity

alpha_div %>%
            select(sample,phylo) %>%
            pivot_longer(-sample, names_to = "data", values_to = "value") %>%
            mutate(data = factor(data, levels = columns))   %>%
            left_join(sample_table, by = join_by(sample == sample)) %>%
            mutate(location=paste0(round(longitude,2),"_",round(latitude,2))) %>%
            ggboxplot(., x = "location", y = "value", color = "location", fill="location") +
            scale_color_manual(values=alpha_colors[c(1:group_n)]) +
            scale_fill_manual(values=paste0(alpha_colors[c(1:group_n)],"50")) +
            stat_compare_means() +
            theme_classic() +
            labs(y = "Phylogenetic Hill numbers") +
            theme(
                legend.position = "none",
                axis.title.x = element_blank(),
                axis.text.x = element_blank())

Functional diversity

alpha_div %>%
            select(sample,func) %>%
            pivot_longer(-sample, names_to = "data", values_to = "value") %>%
            mutate(data = factor(data, levels = columns))   %>%
            left_join(sample_table, by = join_by(sample == sample)) %>%
            mutate(location=paste0(round(longitude,2),"_",round(latitude,2))) %>%
            ggboxplot(., x = "location", y = "value", color = "location", fill="location") +
            scale_color_manual(values=alpha_colors[c(1:group_n)]) +
            scale_fill_manual(values=paste0(alpha_colors[c(1:group_n)],"50")) +
            stat_compare_means() +
            theme_classic() +
            labs(y = "Functional Hill numbers") +
            theme(
                legend.position = "none",
                axis.title.x = element_blank(),
                axis.text.x = element_blank())

Relationship between alpha diversity and sequencing effort

The microbial diversity in a sample is often correlated with sequencing depth, mainly when the sequencing effort is not large enough to properly capture the entirety of the diversity in the sample. The following plot shows the relationship between neutral diversity (Hill number of q=1, Shannon diversity) and the GBs mapped to the MAG catalogue. The size of the dots indicates the total sequencing effort performed for that sample (including host DNA, non-mapped reads, etc.). The flatter the regression line the lower is the correlation, and thus lower is the effect of sequencing depth in the results. However, it must be noted that this analysis only compares the sequencing depth in relation to the MAG catalogue, which might not be complete or representative of the system. Hence, the results shown here must be interpreted cautiously.

ggplot(alpha_div, aes(x=mapped,y=neutral,label=sample)) +
        geom_smooth(method='lm', formula= y~x, color='#e08dde', fill='#e08dde') +
      geom_point(alpha=0.5, color="#6c9ebc") +
      geom_label_repel(max.overlaps = 100, cex=0.7) +
      labs(x = "GBs mapped to MAGs", y = "Neutral diversity (effective number of MAGs)") +
      theme_classic() +
            theme(legend.position="none")

7.3 Beta diversity

Pairwise beta diversities quantify the dissimilarity between samples. Within the Hill numbers framework, beta diversities can be calculated for neutral, phylogenetic and functional diversities, at the desired order of diversity (q-value). Often times, beta diversities differ depending on the components of the diversity. For instance, phylogenetic and functional diversities often display lower beta values than neutral diversities due to phylogenetic and functional redundancy of the microbiota (i.e. different yet closely related and functionally similar MAGs replace each other). Pairwise beta diversities can be displayed in a 2-dimensional ordination using NMDS, and can be used to test for compositional differences between variables of interest. Note that outliers (often technical failures) can distort the ordination considerably.

Beta diversity test

beta_colors <- c("#e5bd5b","#6b7398","#76b183","#d57d2c","#2a2d26","#f9d4cc","#3c634e","#ea68c3")
beta_q1n <- hillpair(count_table_cov_size, q=1, metric="S")
The estimated time for calculating the 666 pairwise combinations is 4 seconds.
sample_table_adonis <- sample_table %>%
    filter(sample %in% labels(beta_q1n)) %>%
    arrange(sample) %>%
    mutate(location=paste0(round(longitude,2),"_",round(latitude,2))) %>%
    select(sample,location,region,country,species,sex,sample_type) %>%
    select_if(~ length(unique(.)) > 1) %>% #remove columns with all-identical values
    column_to_rownames(var = "sample") %>%
    as.data.frame()
    adonis2(formula=beta_q1n ~ ., data=sample_table_adonis[labels(beta_q1n),], permutations=999) %>%
            as.matrix() %>%
            kable()
Df SumOfSqs R2 F Pr(>F)
location 6 3.9382500 0.3061657 2.2044071 0.001
sex 1 0.2899632 0.0225422 0.9738289 0.455
Residual 29 8.6349184 0.6712921 NA NA
Total 36 12.8631316 1.0000000 NA NA

Beta diversity plot

beta_q1n_nmds <- beta_q1n %>%
                metaMDS(.,trymax = 500, k=2, verbosity=FALSE) %>%
                vegan::scores() %>%
                as_tibble(., rownames = "sample") %>%
                left_join(sample_table, by = join_by(sample == sample))
group_n <- length(unique(beta_q1n_nmds$region))

beta_q1n_nmds %>%
            group_by(region) %>%
            mutate(x_cen = mean(NMDS1, na.rm = TRUE)) %>%
            mutate(y_cen = mean(NMDS2, na.rm = TRUE)) %>%
            ungroup() %>%
            ggplot(., aes(x=NMDS1,y=NMDS2, color=region)) +
                scale_color_manual(values=beta_colors[c(1:group_n)]) +
                geom_point(size=2) +
                geom_segment(aes(x=x_cen, y=y_cen, xend=NMDS1, yend=NMDS2), alpha=0.2) +
                theme_classic() +
                theme(legend.position="right", legend.box="vertical") +
                guides(color=guide_legend(title="Region"))