6 Microbial metagenomics

6.1 Microbial DNA fraction

6.1.1 Data overview

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(sample_type) %>%
    summarise(mean=mean(singlem_fraction, na.rm=T),
              sd=sd(singlem_fraction, na.rm=T),
              median=median(singlem_fraction, na.rm = TRUE),
              IQR=IQR(singlem_fraction, na.rm = TRUE)) %>% 
    tt()
tinytable_co05l6lt7zy7pcbdgzin
sample_type mean sd median IQR
Anal/cloacal swab 0.2598510 0.3407707 0.0022 0.574700
Faecal 0.5203527 0.2759433 0.6270 0.350875
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(tax_group) %>%
    summarise(mean=mean(singlem_fraction, na.rm=T),
              sd=sd(singlem_fraction, na.rm=T),
              median=median(singlem_fraction, na.rm = TRUE),
              IQR=IQR(singlem_fraction, na.rm = TRUE)) %>% 
    tt()
tinytable_c7iw5unya0k4gnykovr5
tax_group mean sd median IQR
Amphibians 0.5807892 0.1799217 0.62900 0.128200
Bats 0.2903304 0.2949997 0.20650 0.523400
Birds 0.2777113 0.3360460 0.07850 0.556025
Mammals 0.5871579 0.2841097 0.67420 0.175250
Reptiles 0.5885336 0.1701545 0.63475 0.114050

6.1.2 Statistical test

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(singlem_fraction))%>%
    mutate(singlem_fraction=case_when(singlem_fraction>1~1,
                                      singlem_fraction<=1~singlem_fraction))%>%
    glm(singlem_fraction ~  tax_group + sample_type, data = .,family=quasibinomial)  %>%
    Anova(.,test.statistic = "F",type = "III") %>%
    tidy()%>%
    tt()
tinytable_5ykue44mzlxr1ml4wcg7
term sumsq df F.values p.value
tax_group 148.8863 4 119.1247 2.466079e-91
sample_type 51.7346 1 165.5725 1.784789e-36
Residuals 630.8543 2019 NA NA
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(singlem_fraction))%>%
    mutate(singlem_fraction=case_when(singlem_fraction>1~1,
                                      singlem_fraction<=1~singlem_fraction),
           sample_type=factor(sample_type),
           tax_group=factor(tax_group))%>%
    glm(singlem_fraction ~  tax_group + sample_type, data = .,family=quasibinomial)  %>%
    glht(.,linfct = mcp(tax_group = "Tukey"))%>%
    summary()%>%
    tidy()%>%
    tt()
tinytable_6jd68m30pzmymzjvdf2d
term contrast null.value estimate std.error statistic adj.p.value
tax_group Bats - Amphibians 0 -1.25097357 0.10967066 -11.4066383 0.000000e+00
tax_group Birds - Amphibians 0 -0.88809563 0.11334382 -7.8354131 2.187139e-14
tax_group Mammals - Amphibians 0 0.16650972 0.10026122 1.6607589 4.506772e-01
tax_group Reptiles - Amphibians 0 0.03298124 0.10207377 0.3231118 9.975622e-01
tax_group Birds - Bats 0 0.36287794 0.09724522 3.7315759 1.813404e-03
tax_group Mammals - Bats 0 1.41748328 0.08113495 17.4706869 0.000000e+00
tax_group Reptiles - Bats 0 1.28395481 0.08304328 15.4612727 0.000000e+00
tax_group Mammals - Birds 0 1.05460535 0.08202966 12.8563901 0.000000e+00
tax_group Reptiles - Birds 0 0.92107687 0.08779908 10.4907351 0.000000e+00
tax_group Reptiles - Mammals 0 -0.13352848 0.07012718 -1.9040901 3.080509e-01

6.1.3 Plot

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    mutate(host_percentage= host_bases/bases_post_fastp*100)  %>% #convert bases to gigabases (GB)
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=singlem_fraction, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0, 1) +
        geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        facet_grid(~sample_type) +
        labs(y="Host percentage", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/microbialdata_taxa.pdf",width=9, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    mutate(host_percentage= host_bases/bases_post_fastp*100)  %>% #convert bases to gigabases (GB)
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=singlem_fraction, x=sample_type, group=sample_type)) +
        stat_halfeye(adjust = 1, width = 0.5, .width = 0, justification = 0,normalize = "groups") +
        theme_classic() +
        labs(y="Host percentage", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/microbialdata_taxa_all.pdf",width=9, height=4, units="in")

6.2 Domain-adjusted mapping rate

6.2.1 Data summary

left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(sample_type) %>%
    summarise(mean=mean(damr, na.rm=T),
              sd=sd(damr, na.rm=T),
              median=median(damr, na.rm = TRUE),
              IQR=IQR(damr, na.rm = TRUE)) %>% 
    tt()
tinytable_c3assq5gxt002zqx9cwk
sample_type mean sd median IQR
Anal/cloacal swab 72.84533 22.25244 76.29903 30.61081
Faecal 86.58154 25.00932 100.00000 15.61039
left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    group_by(tax_group) %>%
    summarise(mean=mean(damr, na.rm=T),
              sd=sd(damr, na.rm=T),
              median=median(damr, na.rm = TRUE),
              IQR=IQR(damr, na.rm = TRUE)) %>% 
    tt()
tinytable_dr9lmmhyjh1k3ehj6ok1
tax_group mean sd median IQR
Amphibians 89.19356 24.84575 100.00000 5.927094
Bats 80.94051 24.67483 91.76713 30.814236
Birds 67.03815 24.99000 72.18568 29.811183
Mammals 91.06092 16.77450 100.00000 11.191039
Reptiles 84.52325 31.14150 100.00000 10.748612

6.2.2 Statistical test

left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,11,singlem_fraction)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,1,mapping_percentage/singlem_fraction)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(damr)) %>%
    glm(damr ~ sample_type + tax_group, data = .,family=quasibinomial)  %>%
    Anova(test.statistic = "F",type="III") %>%
    tidy()%>%
    tt()
tinytable_necg0jn0hlxmm5c56mp2
term sumsq df F.values p.value
sample_type 13.49334 1 19.83706 8.794973e-06
tax_group 76.64409 4 28.16933 6.244313e-23
Residuals 1742.01471 2561 NA NA
left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,11,singlem_fraction)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,1,mapping_percentage/singlem_fraction)) %>%
    filter(singlem_fraction>0) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(!is.na(damr)) %>%
    mutate(sample_type=factor(sample_type),
           tax_group=factor(tax_group))%>%
    glm(damr ~ sample_type + tax_group, data = .,family=quasibinomial) %>%
    glht(.,linfct = mcp(tax_group = "Tukey"))%>%
    summary()%>%
    tidy()%>%
    tt()
tinytable_9v3162ghj07dyroxsaxd
term contrast null.value estimate std.error statistic adj.p.value
tax_group Bats - Amphibians 0 -1.5736540 1.1220474 -1.4024844 5.929805e-01
tax_group Birds - Amphibians 0 -2.7097758 1.0700747 -2.5323240 7.226328e-02
tax_group Mammals - Amphibians 0 0.3210380 1.1686087 0.2747181 9.985223e-01
tax_group Reptiles - Amphibians 0 -3.2127466 1.0255111 -3.1328247 1.272998e-02
tax_group Birds - Bats 0 -1.1361218 0.5793468 -1.9610391 2.562934e-01
tax_group Mammals - Bats 0 1.8946920 0.7458078 2.5404560 7.066287e-02
tax_group Reptiles - Bats 0 -1.6390926 0.4921951 -3.3301684 6.400175e-03
tax_group Mammals - Birds 0 3.0308138 0.6650554 4.5572350 3.762506e-05
tax_group Reptiles - Birds 0 -0.5029708 0.3582254 -1.4040625 5.917186e-01
tax_group Reptiles - Mammals 0 -3.5337846 0.5906877 -5.9824922 1.550295e-08

6.2.3 Plot

left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(singlem_fraction>0) %>% 
    ggplot(aes(y=damr, x=sample_type, color=sample_type, fill=sample_type, group=sample_type)) +
        geom_boxplot(outlier.shape = NA) +
        scale_color_manual(values = c("#bdca50", "#AA3377")) +   
        scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
        theme_classic() +
        labs(y="DAMR", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/damr_type.pdf",width=5, height=4, units="in")
left_join(read_tsv("data/mapping.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by="sequence_id") %>%
    left_join(read_tsv("data/sample.tsv"), by="sample_id") %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    mutate(damr=ifelse(singlem_fraction<mapping_percentage,100,(mapping_percentage/singlem_fraction)*100)) %>% #convert bases to gigabases (GB)
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(singlem_fraction>0) %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=damr, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="DAMR", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/damr_taxa.pdf",width=9, height=4, units="in")

6.3 Assemblies

6.3.1 Data summary

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(sample_type) %>%
    summarise(mean=mean(assembly_length, na.rm=T),
              sd=sd(assembly_length, na.rm=T),
              median=median(assembly_length, na.rm = TRUE),
              IQR=IQR(assembly_length, na.rm = TRUE)) %>% 
    tt()
tinytable_cbbsa6ucrpp99v5rusa2
sample_type mean sd median IQR
Anal/cloacal swab 13208042 34504175 0 0
Faecal 95955306 83010036 97047664 151385731
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(tax_group) %>%
    summarise(mean=mean(assembly_length, na.rm=T),
              sd=sd(assembly_length, na.rm=T),
              median=median(assembly_length, na.rm = TRUE),
              IQR=IQR(assembly_length, na.rm = TRUE)) %>% 
    tt()
tinytable_zf71f7nhhu6lrs3bj7fo
tax_group mean sd median IQR
Amphibians 116614689 68762523 121907456 88263039
Bats 31069176 77144825 0 41984504
Birds 8688016 26409919 0 0
Mammals 96415766 69841084 98624268 75754888
Reptiles 139349280 74190452 143970512 93046444

6.3.2 Statistical test

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    lm(rank(assembly_length) ~ sample_type + tax_group, data = .)  %>%
    Anova(type="III") %>%
    tidy()%>%
    tt()
tinytable_hspaf99uxd22elolp5v8
term sumsq df statistic p.value
(Intercept) 33267819 1 314.2467 3.650912e-64
sample_type 13028568 1 123.0674 1.427942e-27
tax_group 100440135 4 237.1885 5.790079e-159
Residuals 163985333 1549 NA NA

6.3.3 Plot

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=assembly_length, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0,400000000)+
        geom_jitter(alpha = 0.3, width=0.3, size=0.5) +
        geom_violin() +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        facet_grid(~sample_type, scale="free") +
        labs(y="Assembly size", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/assemblysize_taxa.pdf",width=9, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=assembly_length, x=sample_type, group=sample_type)) +
          ylim(0,400000000)+
        stat_halfeye(adjust = 1, width = 0.5, .width = 0, justification = 0,normalize = "groups") +
        theme_classic() +
        labs(y="Assembly size", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/assemblysize_taxa_all.pdf",width=9, height=4, units="in")

6.4 Number of MAGs

6.4.1 Summary statistics

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(sample_type) %>%
    summarise(mean=mean(assembly_num_bins, na.rm=T),
              sd=sd(assembly_num_bins, na.rm=T),
              median=median(assembly_num_bins, na.rm = TRUE),
              IQR=IQR(assembly_num_bins, na.rm = TRUE)) %>% 
    tt()
tinytable_yhrk6nna4nldtafyjmx1
sample_type mean sd median IQR
Anal/cloacal swab 2.644068 6.971768 0 0
Faecal 18.386792 16.099835 19 29
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    group_by(tax_group) %>%
    summarise(mean=mean(assembly_num_bins, na.rm=T),
              sd=sd(assembly_num_bins, na.rm=T),
              median=median(assembly_num_bins, na.rm = TRUE),
              IQR=IQR(assembly_num_bins, na.rm = TRUE)) %>% 
    tt()
tinytable_r3620mynnozhstpbj2ur
tax_group mean sd median IQR
Amphibians 23.758389 14.944334 24 18
Bats 3.692308 7.451098 0 5
Birds 1.582979 5.600340 0 0
Mammals 22.062635 16.345908 23 19
Reptiles 23.924107 13.736366 24 18

6.4.2 Statistical test

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    glm(assembly_num_bins ~ sample_type + tax_group, data = .,family=quasipoisson)  %>%
    Anova(test.statistic = "F",type = "III") %>%
    tidy()%>%
    tt()
tinytable_alcnhcnpo8lm322vbd9g
term sumsq df F.values p.value
sample_type 1781.293 1 131.4547 2.875375e-29
tax_group 9870.795 4 182.1096 5.599140e-128
Residuals 20989.917 1549 NA NA
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(sample_type=factor(sample_type),
           tax_group=factor(tax_group))%>%
    glm(assembly_num_bins ~ sample_type + tax_group, data = .,family=quasipoisson)  %>%
    glht(.,linfct = mcp(tax_group = "Tukey"))%>%
    summary()%>%
    tidy()%>%
    tt()
tinytable_wmg4hc7i6ufe0ir66tav
term contrast null.value estimate std.error statistic adj.p.value
tax_group Bats - Amphibians 0 -1.89252007 0.13396130 -14.12736443 0.0000000
tax_group Birds - Amphibians 0 -2.37578946 0.20160147 -11.78458429 0.0000000
tax_group Mammals - Amphibians 0 -0.02494121 0.07183869 -0.34718348 0.9962961
tax_group Reptiles - Amphibians 0 -0.00222888 0.07136120 -0.03123378 0.9999997
tax_group Birds - Bats 0 -0.48326938 0.22581601 -2.14010238 0.1759353
tax_group Mammals - Bats 0 1.86757887 0.12433118 15.02100124 0.0000000
tax_group Reptiles - Bats 0 1.89029119 0.12401954 15.24188255 0.0000000
tax_group Mammals - Birds 0 2.35084825 0.19506107 12.05185782 0.0000000
tax_group Reptiles - Birds 0 2.37356058 0.19518653 12.16047329 0.0000000
tax_group Reptiles - Mammals 0 0.02271233 0.05098722 0.44545131 0.9903572

6.4.3 Plot

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    ggplot(aes(y=assembly_num_bins, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0,100) +
        geom_jitter(alpha = 0.6, width=0.3, size=0.5) +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        facet_grid(~sample_type, scale="free") +
        labs(y="Number of MAGs", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/numberofbins_taxa.pdf",width=9, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=assembly_num_bins, x=sample_type, group=sample_type)) +
        ylim(0,100) +
        stat_halfeye(adjust = 1, width = 0.5, .width = 0, justification = 0,normalize = "groups") +
        theme_classic() +
        labs(y="Number of MAGs", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/numberofbins_taxa_all.pdf",width=9, height=4, units="in")

6.5 MAG quality

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=mag_completeness, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(50,100) +
        #geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5, width = 0.5, .width = 0,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="Number of bins", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/completeness_taxa.pdf",width=5, height=4, units="in")
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=mag_contamination, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0,10) +
        #geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
        stat_halfeye(adjust = 0.5, width = 0.5, .width = 0,normalize = "groups") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="Number of bins", color="Taxa", fill="Taxa") +
        theme_classic()

ggsave("figures/contamination_taxa.pdf",width=5, height=4, units="in")

6.6 Assemblies vs MAGs

left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual")%>%
    with(cor(assembly_num_bins,assembly_length))
[1] 0.8511555
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    left_join(read_tsv("data/mag.tsv"), by="assembly_id") %>% 
    filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
    filter(assembly_type == "Individual") %>% 
    ggplot(aes(y=assembly_num_bins, x=assembly_length, color=tax_group)) +
        geom_point(alpha=0.5, size=0.5) +
        scale_color_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        geom_smooth(method="lm",se=FALSE)+
        theme_classic() +
        labs(y="Number of bins", x="Assembly length", color="Taxa", fill="Taxa")

6.7 Microbial fraction vs MAGs

microbial_fraction_mag <- left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    dplyr::select(assembly_num_bins,singlem_fraction,tax_group)

lm(assembly_num_bins ~ singlem_fraction * tax_group, data = microbial_fraction_mag) %>% 
  summary() %>% 
  tidy() %>% 
  tt()
tinytable_ciwg7bozrz8g5e224v84
term estimate std.error statistic p.value
(Intercept) 3.12532677 4.96020104 0.6300807 5.287470e-01
singlem_fraction 0.35631843 0.07871737 4.5265539 6.515925e-06
tax_groupReptiles -6.78245261 5.90661233 -1.1482813 2.510532e-01
tax_groupBirds -3.86068566 5.17541920 -0.7459658 4.558162e-01
tax_groupBats -1.02417632 5.13266089 -0.1995410 8.418692e-01
tax_groupMammals 0.77359372 5.47676727 0.1412501 8.876932e-01
singlem_fraction:tax_groupReptiles 0.10319272 0.09359558 1.1025385 2.704216e-01
singlem_fraction:tax_groupBirds -0.28420760 0.08484622 -3.3496788 8.310810e-04
singlem_fraction:tax_groupBats -0.31514636 0.08324979 -3.7855513 1.600051e-04
singlem_fraction:tax_groupMammals -0.06268436 0.08524577 -0.7353369 4.622604e-01
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    ggplot(aes(y=assembly_num_bins, x=singlem_fraction, color=tax_group, fill=tax_group, group=tax_group)) +
        ylim(0,100) +
        geom_jitter(alpha = 0.6, width=0.3, size=0.5) +
        geom_smooth(method="lm") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="Number of MAGs", color="Taxa", fill="Taxa") +
        theme_classic()

6.8 Microbial fraction vs Nonpareil

microbial_fraction_nonpareil <- left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    dplyr::select(nonpareil_coverage,singlem_fraction,tax_group)

lm(nonpareil_coverage ~ singlem_fraction * tax_group, data = microbial_fraction_nonpareil) %>% 
  summary() %>% 
  tidy() %>% 
  tt()
tinytable_xoyra2aaghcdn4e29mf9
term estimate std.error statistic p.value
(Intercept) 0.469265681 0.0385474236 12.173724 1.911756e-32
singlem_fraction 0.005266754 0.0006117397 8.609470 1.986087e-17
tax_groupReptiles 0.197073303 0.0459023104 4.293320 1.884265e-05
tax_groupBirds 0.234670179 0.0402199577 5.834670 6.719289e-09
tax_groupBats 0.220349853 0.0398876683 5.524260 3.955751e-08
tax_groupMammals 0.106761296 0.0425618369 2.508381 1.224354e-02
singlem_fraction:tax_groupReptiles -0.002286717 0.0007273633 -3.143844 1.703285e-03
singlem_fraction:tax_groupBirds -0.002925969 0.0006593691 -4.437528 9.830169e-06
singlem_fraction:tax_groupBats -0.002763496 0.0006469627 -4.271492 2.075836e-05
singlem_fraction:tax_groupMammals -0.001383420 0.0006624741 -2.088263 3.695868e-02
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by="sample_id") %>%
    left_join(read_tsv("data/assembly.tsv"), by="preprocessing_id") %>% 
    filter(sample_type %in% c("Faecal")) %>%
    filter(assembly_type == "Individual") %>% 
    mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>% 
    mutate(singlem_fraction=ifelse(singlem_fraction>1,100,singlem_fraction*100)) %>% 
    ggplot(aes(y=nonpareil_coverage, x=singlem_fraction, color=tax_group, fill=tax_group, group=tax_group)) +
        geom_jitter(alpha = 0.6, width=0.3, size=0.5) +
        geom_smooth(method="lm") +
        scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
        scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
        theme_classic() +
        labs(y="Nonpareil completeness", color="Taxa", fill="Taxa") +
        theme_classic()