5 Host genomics
5.1 Host DNA fraction
5.1.1 Data overview
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
mutate(host_percentage= host_bases/bases_post_fastp*100) %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
group_by(sample_type) %>%
summarise(mean=mean(host_percentage, na.rm=T),
sd=sd(host_percentage, na.rm=T),
median=median(host_percentage, na.rm = TRUE),
IQR=IQR(host_percentage, na.rm = TRUE)) %>%
tt()
sample_type | mean | sd | median | IQR |
---|---|---|---|---|
Anal/cloacal swab | 75.76413 | 32.05899 | 89.31666 | 36.46366 |
Faecal | 22.94532 | 32.09473 | 6.09630 | 31.88080 |
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
mutate(host_percentage= host_bases/bases_post_fastp*100) %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
group_by(tax_group) %>%
summarise(mean=mean(host_percentage, na.rm=T),
sd=sd(host_percentage, na.rm=T),
median=median(host_percentage, na.rm = TRUE),
IQR=IQR(host_percentage, na.rm = TRUE)) %>%
tt()
tax_group | mean | sd | median | IQR |
---|---|---|---|---|
Amphibians | 0.2968512 | 1.367301 | 0.02128969 | 0.05763236 |
Bats | 49.4004705 | 37.779131 | 41.34831756 | 86.63187989 |
Birds | 58.5039633 | 38.603463 | 70.50510687 | 81.48351233 |
Mammals | 29.5029544 | 36.464343 | 8.99683115 | 50.10072047 |
Reptiles | 12.4971018 | 22.008872 | 3.48495016 | 10.58589423 |
5.1.2 Statistical test
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
mutate(host_percentage= host_bases/bases_post_fastp*100) %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
glm(cbind(host_bases,bases_post_fastp) ~ sample_type + tax_group, data = .,family=quasibinomial) %>%
Anova(.,test="F",type="III") %>%
tidy()%>%
tt()
term | sumsq | df | F.values | p.value |
---|---|---|---|---|
sample_type | 4.573093e+11 | 1 | 508.6208 | 1.241797e-100 |
tax_group | 8.316384e+11 | 4 | 231.2377 | 1.420289e-163 |
Residuals | 1.815316e+12 | 2019 | NA | NA |
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
mutate(sample_type = factor(sample_type),tax_group=factor(tax_group)) %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
glm(cbind(host_bases,bases_post_fastp) ~ sample_type + tax_group, data = .,family=quasibinomial) %>%
glht(.,linfct = mcp(tax_group = "Tukey"))%>%
summary()%>%
tidy()%>%
tt()
term | contrast | null.value | estimate | std.error | statistic | adj.p.value |
---|---|---|---|---|---|---|
tax_group | Bats - Amphibians | 0 | 5.1412163 | 0.68642282 | 7.489868 | 3.935741e-13 |
tax_group | Birds - Amphibians | 0 | 4.9599164 | 0.68662378 | 7.223630 | 1.793010e-12 |
tax_group | Mammals - Amphibians | 0 | 4.4582788 | 0.68642685 | 6.494907 | 3.172528e-10 |
tax_group | Reptiles - Amphibians | 0 | 3.6999420 | 0.68797728 | 5.378000 | 6.455164e-07 |
tax_group | Birds - Bats | 0 | -0.1812999 | 0.05728337 | -3.164966 | 1.009358e-02 |
tax_group | Mammals - Bats | 0 | -0.6829374 | 0.05466064 | -12.494135 | 0.000000e+00 |
tax_group | Reptiles - Bats | 0 | -1.4412742 | 0.07106532 | -20.280979 | 0.000000e+00 |
tax_group | Mammals - Birds | 0 | -0.5016376 | 0.05073241 | -9.887911 | 0.000000e+00 |
tax_group | Reptiles - Birds | 0 | -1.2599743 | 0.07254355 | -17.368522 | 0.000000e+00 |
tax_group | Reptiles - Mammals | 0 | -0.7583368 | 0.07074971 | -10.718585 | 0.000000e+00 |
5.1.3 Plot
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
mutate(host_percentage= host_bases/bases_post_fastp*100) %>% #convert bases to gigabases (GB)
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>%
ggplot(., aes(y=host_percentage, x=tax_group, color=tax_group, fill=tax_group, group=tax_group)) +
geom_jitter(alpha = 0.2, width=0.3) +
geom_boxplot(outlier.shape = NA) +
scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
theme_classic() +
facet_grid(~sample_type) +
labs(y="Host percentage", color="Taxa", fill="Taxa") +
theme_classic()
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
mutate(host_percentage= host_bases/bases_post_fastp*100) %>% #convert bases to gigabases (GB)
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>%
ggplot(., aes(y=host_percentage, x=sample_type, group=sample_type)) +
stat_halfeye(adjust = 1, width = 0.5, .width = 0, justification = 0,normalize = "groups") +
theme_classic() +
labs(y="Host percentage", color="Taxa", fill="Taxa") +
theme_classic()
ggsave("figures/hostdata_taxa_all.pdf",width=9, height=4, units="in")
5.2 Genome depth
5.2.1 Data overview
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
left_join(read_tsv("data/reference.tsv"),by="reference_id") %>%
mutate(depth=host_bases/(reference_size*1000000)) %>% #convert bases to gigabases (GB)
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
group_by(sample_type) %>%
summarise(mean=mean(depth, na.rm=T),
sd=sd(depth, na.rm=T),
median=median(depth, na.rm = TRUE),
IQR=IQR(depth, na.rm = TRUE)) %>%
tt()
sample_type | mean | sd | median | IQR |
---|---|---|---|---|
Anal/cloacal swab | 2.4220586 | 2.079382 | 1.9656642 | 1.9114049 |
Faecal | 0.6573435 | 1.628490 | 0.1301568 | 0.6486201 |
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
left_join(read_tsv("data/reference.tsv"),by="reference_id") %>%
mutate(depth=host_bases/(reference_size*1000000)) %>% #convert bases to gigabases (GB)
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
group_by(tax_group) %>%
summarise(mean=mean(depth, na.rm=T),
sd=sd(depth, na.rm=T),
median=median(depth, na.rm = TRUE),
IQR=IQR(depth, na.rm = TRUE)) %>%
tt()
tax_group | mean | sd | median | IQR |
---|---|---|---|---|
Amphibians | 0.001921062 | 0.01314311 | 0.0000433766 | 0.0001050921 |
Bats | 1.246383229 | 1.39021316 | 0.9629904655 | 1.6487318021 |
Birds | 2.478493853 | 3.08305053 | 1.9449031457 | 2.7817309257 |
Mammals | 0.595230910 | 1.27474674 | 0.1327805697 | 0.6743426328 |
Reptiles | 0.345241814 | 0.60682463 | 0.1076937456 | 0.3145671900 |
5.2.2 Statistical test
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
left_join(read_tsv("data/reference.tsv"),by="reference_id") %>%
mutate(depth=host_bases/(reference_size*1000000)) %>% #convert bases to gigabases (GB)
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
lm(rank(depth) ~ sample_type + tax_group, data = .) %>%
Anova(type = "III") %>%
tidy()%>%
tt()
term | sumsq | df | statistic | p.value |
---|---|---|---|---|
(Intercept) | 45259288 | 1 | 255.3583 | 2.982539e-54 |
sample_type | 75036348 | 1 | 423.3640 | 1.215557e-85 |
tax_group | 208078989 | 4 | 293.5016 | 1.475300e-199 |
Residuals | 361920741 | 2042 | NA | NA |
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
left_join(read_tsv("data/reference.tsv"),by="reference_id") %>%
mutate(depth=host_bases/(reference_size*1000000)) %>% #convert bases to gigabases (GB)
mutate(sample_type = factor(sample_type),tax_group=factor(tax_group)) %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
lm(rank(depth) ~ sample_type + tax_group, data = .) %>%
glht(.,linfct = mcp(tax_group = "Tukey"))%>%
summary()%>%
tidy()%>%
tt()
term | contrast | null.value | estimate | std.error | statistic | adj.p.value |
---|---|---|---|---|---|---|
tax_group | Bats - Amphibians | 0 | 1208.81726 | 39.35163 | 30.718356 | 0.0000000 |
tax_group | Birds - Amphibians | 0 | 1131.10292 | 41.03771 | 27.562527 | 0.0000000 |
tax_group | Mammals - Amphibians | 0 | 717.21446 | 36.74653 | 19.517881 | 0.0000000 |
tax_group | Reptiles - Amphibians | 0 | 744.78843 | 37.68408 | 19.764006 | 0.0000000 |
tax_group | Birds - Bats | 0 | -77.71434 | 33.53182 | -2.317630 | 0.1351404 |
tax_group | Mammals - Bats | 0 | -491.60281 | 27.93331 | -17.599160 | 0.0000000 |
tax_group | Reptiles - Bats | 0 | -464.02884 | 29.08689 | -15.953197 | 0.0000000 |
tax_group | Mammals - Birds | 0 | -413.88846 | 28.68250 | -14.429998 | 0.0000000 |
tax_group | Reptiles - Birds | 0 | -386.31450 | 31.32238 | -12.333498 | 0.0000000 |
tax_group | Reptiles - Mammals | 0 | 27.57397 | 25.44895 | 1.083501 | 0.8102494 |
5.2.3 Plot
left_join(read_tsv("data/preprocessing.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
left_join(read_tsv("data/reference.tsv"),by="reference_id") %>%
mutate(depth=host_bases/(reference_size*1000000)) %>% #convert bases to gigabases (GB)
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>%
ggplot(., aes(y=depth, x=sample_type, color=sample_type, fill=sample_type, group=sample_type)) +
ylim(0,10)+
geom_boxplot(outlier.shape = NA) +
scale_color_manual(values = c("#bdca50", "#AA3377")) +
scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
theme_classic() +
labs(y="Host depth of coverage", color="Sample type", fill="Sample type") +
theme_classic()