3 General statistics
3.1 Sample statistics
3.1.1 All
# Count the distinct specimens and the distinct specimen species, orders and
# families over every row of the sample table, and render the one-row result.
"data/sample.tsv" %>%
  read_tsv() %>%
  summarise(
    specimens = n_distinct(specimen_id),
    species   = n_distinct(specimen_species),
    orders    = n_distinct(specimen_order),
    families  = n_distinct(specimen_family)
  ) %>%
  tt()
specimens | species | orders | families |
---|---|---|---|
4364 | 244 | 22 | 67 |
3.1.2 Subset
Faecal and anal/cloacal swab samples used in the study.
# Same counts as for the full table, restricted to faecal and anal/cloacal
# swab samples that have a matching row in the extraction table
# (inner join on sample_id), plus the number of distinct samples per type.
read_tsv("data/sample.tsv") %>%
  inner_join(
    read_tsv("data/extraction.tsv"),
    by = "sample_id"
  ) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  summarise(
    specimens = n_distinct(specimen_id),
    species   = n_distinct(specimen_species),
    orders    = n_distinct(specimen_order),
    families  = n_distinct(specimen_family),
    # Per-type sample counts
    swabs     = n_distinct(sample_id[sample_type == "Anal/cloacal swab"]),
    faeces    = n_distinct(sample_id[sample_type == "Faecal"])
  ) %>%
  tt()
specimens | species | orders | families | swabs | faeces |
---|---|---|---|---|---|
2025 | 151 | 17 | 54 | 442 | 1824 |
3.1.3 Origin of samples (Figure S1)
# Figure S1: world map of capture locations, one point per row of the sample
# table, coloured by taxonomic group.
read_tsv("data/sample.tsv") %>%
  # Subset columns
  dplyr::select(
    sample_id,
    specimen_species,
    specimen_order,
    specimen_class,
    capture_latitude,
    capture_longitude,
    tax_group
  ) %>%
  # Add Gaussian jitter (sd = 0.5, in coordinate units) so that points sharing
  # a capture location do not sit exactly on top of each other.
  # NOTE(review): no seed is set in this chunk, so point positions differ
  # between renders unless a seed is set earlier in the document.
  mutate(
    capture_latitude_jitter = capture_latitude + rnorm(n(), mean = 0, sd = 0.5),
    capture_longitude_jitter = capture_longitude + rnorm(n(), mean = 0, sd = 0.5)
  ) %>%
  # Fix the legend order of the groups
  mutate(
    tax_group = factor(
      tax_group,
      levels = c("Amphibians", "Reptiles", "Birds", "Bats", "Mammals")
    )
  ) %>%
  # Plot map
  ggplot() +
  geom_map(
    data = map_data("world"),
    map = map_data("world"),
    aes(long, lat, map_id = region),
    # `linewidth` (not `size`) controls polygon outlines since ggplot2 3.4.0
    color = "white", fill = "#cccccc", linewidth = 0.2
  ) +
  geom_point(
    aes(
      x = capture_longitude_jitter,
      y = capture_latitude_jitter,
      color = tax_group
    ),
    alpha = 0.5, size = 0.5, shape = 16
  ) +
  # Named values pin each colour to its group; an unnamed vector is matched
  # positionally to the levels present, so colours would shift if a group
  # were missing from the data.
  scale_color_manual(
    values = c(
      "Amphibians" = "#228833",
      "Reptiles"   = "#EE6677",
      "Birds"      = "#CCBB44",
      "Bats"       = "#66CCEE",
      "Mammals"    = "#4477AA"
    )
  ) +
  # tax_group mixes taxonomic ranks, so it is labelled as a group, not an order
  labs(color = "Taxonomic group") +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom"
  )
3.2 Data statistics
3.2.1 Total data
# Sequencing volume before fastp (bases_pre_fastp), in gigabases (Gb), for
# faecal and anal/cloacal swab samples: overall and per-type totals, number of
# distinct preprocessing entries per type, and location/spread statistics.
read_tsv("data/preprocessing.tsv") %>%
  left_join(
    read_tsv("data/sample.tsv"),
    by = "sample_id"
  ) %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(bases_pre_fastp = bases_pre_fastp / 1e9) %>%
  summarise(
    total    = sum(bases_pre_fastp, na.rm = TRUE),
    swabs    = sum(bases_pre_fastp[sample_type == "Anal/cloacal swab"], na.rm = TRUE),
    faeces   = sum(bases_pre_fastp[sample_type == "Faecal"], na.rm = TRUE),
    swabs_n  = n_distinct(preprocessing_id[sample_type == "Anal/cloacal swab"], na.rm = TRUE),
    faeces_n = n_distinct(preprocessing_id[sample_type == "Faecal"], na.rm = TRUE),
    mean     = mean(bases_pre_fastp, na.rm = TRUE),
    sd       = sd(bases_pre_fastp, na.rm = TRUE),
    median   = median(bases_pre_fastp, na.rm = TRUE),
    IQR      = IQR(bases_pre_fastp, na.rm = TRUE)
  ) %>%
  tt()
total | swabs | faeces | swabs_n | faeces_n | mean | sd | median | IQR |
---|---|---|---|---|---|---|---|---|
11262.32 | 1998.166 | 9264.149 | 345 | 1702 | 5.561637 | 4.804119 | 4.918368 | 2.697444 |
3.2.2 Quality-filtered data
# Data volume after fastp quality filtering (bases_post_fastp), in gigabases
# (Gb): total, mean, sd, median and IQR.
# Restricted to faecal and anal/cloacal swab samples, exactly as the total-data
# chunk (3.2.1) is. Without this filter the statistics covered every row of the
# preprocessing table, and the post-filtering total came out larger than the
# pre-filtering total.
# NOTE(review): the rendered table beneath this chunk predates the filter and
# must be regenerated.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(bases_post_fastp = bases_post_fastp / 1e9) %>%
  summarise(
    total  = sum(bases_post_fastp, na.rm = TRUE),
    mean   = mean(bases_post_fastp, na.rm = TRUE),
    sd     = sd(bases_post_fastp, na.rm = TRUE),
    median = median(bases_post_fastp, na.rm = TRUE),
    IQR    = IQR(bases_post_fastp, na.rm = TRUE)
  ) %>%
  tt()
total | mean | sd | median | IQR |
---|---|---|---|---|
13486.81 | 5.377518 | 4.586315 | 4.659982 | 2.81969 |
3.2.3 Host genomic data
# Host-derived data volume (host_bases), in gigabases (Gb): total, mean, sd,
# median and IQR.
# Restricted to faecal and anal/cloacal swab samples so that the figures
# describe the same sample set as the total-data chunk (3.2.1); previously
# every row of the preprocessing table was included.
# NOTE(review): the rendered table beneath this chunk predates the filter and
# must be regenerated.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(host_bases = host_bases / 1e9) %>%
  summarise(
    total  = sum(host_bases, na.rm = TRUE),
    mean   = mean(host_bases, na.rm = TRUE),
    sd     = sd(host_bases, na.rm = TRUE),
    median = median(host_bases, na.rm = TRUE),
    IQR    = IQR(host_bases, na.rm = TRUE)
  ) %>%
  tt()
total | mean | sd | median | IQR |
---|---|---|---|---|
5554.962 | 2.19477 | 3.711089 | 0.731476 | 3.37658 |
3.2.4 Metagenomic data
# Metagenomic data volume (metagenomic_bases), in gigabases (Gb): total, mean,
# sd, median and IQR.
# Restricted to faecal and anal/cloacal swab samples so that the figures
# describe the same sample set as the total-data chunk (3.2.1); previously
# every row of the preprocessing table was included.
# NOTE(review): the rendered table beneath this chunk predates the filter and
# must be regenerated.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(metagenomic_bases = metagenomic_bases / 1e9) %>%
  summarise(
    total  = sum(metagenomic_bases, na.rm = TRUE),
    mean   = mean(metagenomic_bases, na.rm = TRUE),
    sd     = sd(metagenomic_bases, na.rm = TRUE),
    median = median(metagenomic_bases, na.rm = TRUE),
    IQR    = IQR(metagenomic_bases, na.rm = TRUE)
  ) %>%
  tt()
total | mean | sd | median | IQR |
---|---|---|---|---|
7931.853 | 3.133881 | 3.272415 | 2.964926 | 3.881528 |
3.2.5 Assemblies
# Per assembly type: number of distinct assemblies (overall, and per sample
# type) and summed assembly length per sample type.
read_tsv("data/assembly.tsv") %>%
  left_join(read_tsv("data/preprocessing.tsv"), by = "preprocessing_id") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # Keep one row per assembly and sample type before summing lengths. The
  # counts below already use n_distinct(assembly_id); summing assembly_length
  # over the raw joined rows would add an assembly's length once per row it
  # appears on (e.g. once per contributing sample of a coassembly).
  # If every assembly occupies a single row this step changes nothing.
  # NOTE(review): the rendered table beneath this chunk predates this step and
  # must be regenerated.
  distinct(assembly_type, assembly_id, sample_type, assembly_length) %>%
  group_by(assembly_type) %>%
  summarise(
    assembly_n  = n_distinct(assembly_id),
    swabs_n     = n_distinct(assembly_id[sample_type == "Anal/cloacal swab"], na.rm = TRUE),
    faeces_n    = n_distinct(assembly_id[sample_type == "Faecal"], na.rm = TRUE),
    swabs_size  = sum(assembly_length[sample_type == "Anal/cloacal swab"], na.rm = TRUE),
    faeces_size = sum(assembly_length[sample_type == "Faecal"], na.rm = TRUE)
  ) %>%
  tt()
assembly_type | assembly_n | swabs_n | faeces_n | swabs_size | faeces_size |
---|---|---|---|---|---|
Coassembly | 294 | 41 | 227 | 20029149172 | 916263568901 |
Individual | 1722 | 177 | 1377 | 2337823366 | 132226412145 |
Multisplit | 1 | 0 | 1 | 0 | 0 |
NA | 1 | 0 | 1 | 0 | 0 |
3.2.6 MAGs
# MAG summary: number of MAGs (overall and per sample type), number of phyla,
# and mean completeness and contamination.
read_tsv("data/mag.tsv") %>%
  left_join(read_tsv("data/assembly.tsv"), by = "assembly_id") %>%
  left_join(read_tsv("data/preprocessing.tsv"), by = "preprocessing_id") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # Collapse the joined table to unique MAG x sample-type rows
  distinct(mag_id, mag_phylum, mag_completeness, mag_contamination, sample_type) %>%
  summarise(
    # A MAG linked to more than one sample type still occupies several rows
    # here, so count distinct ids rather than rows ...
    number = n_distinct(mag_id),
    swabs_n = n_distinct(mag_id[sample_type == "Anal/cloacal swab"], na.rm = TRUE),
    faeces_n = n_distinct(mag_id[sample_type == "Faecal"], na.rm = TRUE),
    phylums = n_distinct(mag_phylum),
    # ... and let each MAG contribute once to the quality means.
    completeness = mean(mag_completeness[!duplicated(mag_id)]),
    contamination = mean(mag_contamination[!duplicated(mag_id)])
  ) %>%
  # NOTE(review): if any MAG was linked to several sample types, the rendered
  # table beneath this chunk is out of date and must be regenerated.
  tt()
number | swabs_n | faeces_n | phylums | completeness | contamination |
---|---|---|---|---|---|
51690 | 1896 | 47757 | 42 | 83.52126 | 2.000393 |