3 General statistics

3.1 Sample statistics

3.1.1 All

# Count the distinct specimens, species, orders and families recorded in the
# sample table and render the one-row summary with tinytable.
read_tsv("data/sample.tsv") %>%
  summarise(
    specimens = n_distinct(specimen_id),
    species   = n_distinct(specimen_species),
    orders    = n_distinct(specimen_order),
    families  = n_distinct(specimen_family)
  ) %>%
  tt()
tinytable_cmwbx9frmwo3fsf4havn
specimens species orders families
4364 244 22 67

3.1.2 Subset

Summary restricted to the faecal and anal/cloacal swab samples employed in the study.

# Same counts as above, restricted to samples that have an extraction record
# (inner join on sample_id) and whose type is faecal or anal/cloacal swab.
# The last two columns give the number of distinct samples of each type.
read_tsv("data/sample.tsv") %>%
  inner_join(read_tsv("data/extraction.tsv"), by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  summarise(
    specimens = n_distinct(specimen_id),
    species   = n_distinct(specimen_species),
    orders    = n_distinct(specimen_order),
    families  = n_distinct(specimen_family),
    swabs     = n_distinct(sample_id[sample_type == "Anal/cloacal swab"]),
    faeces    = n_distinct(sample_id[sample_type == "Faecal"])
  ) %>%
  tt()
tinytable_lzrlaft7ge166un5mr2n
specimens species orders families swabs faeces
2025 151 17 54 442 1824

3.1.3 Origin of samples (Figure S1)

# Figure S1: world map with one jittered point per sample at its capture
# coordinates, coloured by tax_group.
read_tsv("data/sample.tsv") %>%
  # Keep only the columns relevant to the map
  dplyr::select(
    sample_id,
    specimen_species,
    specimen_order,
    specimen_class,
    capture_latitude,
    capture_longitude,
    tax_group
  ) %>%
  # Add Gaussian jitter (sd = 0.5 in coordinate units) so samples that share
  # coordinates do not sit exactly on top of each other.
  # NOTE(review): no RNG seed is set, so point positions differ between renders.
  mutate(
    capture_latitude_jitter = capture_latitude + rnorm(n(), mean = 0, sd = 0.5),
    capture_longitude_jitter = capture_longitude + rnorm(n(), mean = 0, sd = 0.5)
  ) %>%
  # Fix the legend order of the groups
  mutate(
    tax_group = factor(
      tax_group,
      levels = c("Amphibians", "Reptiles", "Birds", "Bats", "Mammals")
    )
  ) %>%
  # Plot map
  ggplot() +
    geom_map(
      data = map_data("world"),
      map = map_data("world"),
      aes(long, lat, map_id = region),
      # `linewidth` replaces `size` for line widths since ggplot2 3.4.0
      color = "white", fill = "#cccccc", linewidth = 0.2
    ) +
    geom_point(
      aes(
        x = capture_longitude_jitter,
        y = capture_latitude_jitter,
        color = tax_group
      ),
      alpha = 0.5, size = 0.5, shape = 16
    ) +
    # Named values keep each group on its colour even if a group is absent
    # from the data (unnamed values are assigned by position).
    scale_color_manual(
      values = c(
        Amphibians = "#228833",
        Reptiles   = "#EE6677",
        Birds      = "#CCBB44",
        Bats       = "#66CCEE",
        Mammals    = "#4477AA"
      )
    ) +
    # The legend shows tax_group, whose levels are not taxonomic orders
    labs(color = "Taxonomic group") +
    theme_minimal() +
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      legend.position = "bottom"
    )

3.2 Data statistics

3.2.1 Total data

# Raw (pre-fastp) sequencing yield of the faecal and anal/cloacal swab samples:
# overall and per-type totals, number of distinct preprocessing records per
# type, and the spread of the per-record yield.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(bases_pre_fastp = bases_pre_fastp / 1e9) %>%
  summarise(
    total    = sum(bases_pre_fastp, na.rm = TRUE),
    swabs    = sum(
      bases_pre_fastp[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces   = sum(bases_pre_fastp[sample_type == "Faecal"], na.rm = TRUE),
    swabs_n  = n_distinct(
      preprocessing_id[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_n = n_distinct(
      preprocessing_id[sample_type == "Faecal"],
      na.rm = TRUE
    ),
    mean     = mean(bases_pre_fastp, na.rm = TRUE),
    sd       = sd(bases_pre_fastp, na.rm = TRUE),
    median   = median(bases_pre_fastp, na.rm = TRUE),
    IQR      = IQR(bases_pre_fastp, na.rm = TRUE)
  ) %>%
  tt()
tinytable_lyy1s2lj7zxlvsy7q0jw
total swabs faeces swabs_n faeces_n mean sd median IQR
11262.32 1998.166 9264.149 345 1702 5.561637 4.804119 4.918368 2.697444

3.2.2 Quality-filtered data

# Quality-filtered (post-fastp) sequencing yield in gigabases.
# Restricted to the faecal and anal/cloacal swab samples, using the same join
# and filter as the raw-data summary in 3.2.1, so that both summaries describe
# the same set of samples (without the filter the post-filter total is larger
# than the raw total).
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(bases_post_fastp = bases_post_fastp / 1000000000) %>%
  summarise(
    total  = sum(bases_post_fastp, na.rm = TRUE),
    mean   = mean(bases_post_fastp, na.rm = TRUE),
    sd     = sd(bases_post_fastp, na.rm = TRUE),
    median = median(bases_post_fastp, na.rm = TRUE),
    IQR    = IQR(bases_post_fastp, na.rm = TRUE)
  ) %>%
  tt()
tinytable_ctihtt0r4n47zn1eo1ns
total mean sd median IQR
13486.81 5.377518 4.586315 4.659982 2.81969

3.2.3 Host genomic data

# Host-derived sequencing yield in gigabases.
# Restricted to the faecal and anal/cloacal swab samples, using the same join
# and filter as the raw-data summary in 3.2.1, so that both summaries describe
# the same set of samples.
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(host_bases = host_bases / 1000000000) %>%
  summarise(
    total  = sum(host_bases, na.rm = TRUE),
    mean   = mean(host_bases, na.rm = TRUE),
    sd     = sd(host_bases, na.rm = TRUE),
    median = median(host_bases, na.rm = TRUE),
    IQR    = IQR(host_bases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_r9mgrf6u5n7frfqpsex1
total mean sd median IQR
5554.962 2.19477 3.711089 0.731476 3.37658

3.2.4 Metagenomic data

# Metagenomic sequencing yield in gigabases.
# Restricted to the faecal and anal/cloacal swab samples, using the same join
# and filter as the raw-data summary in 3.2.1, so that both summaries describe
# the same set of samples.
left_join(read_tsv("data/preprocessing.tsv"),
          read_tsv("data/sample.tsv"),
          by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  # Convert bases to gigabases (Gb)
  mutate(metagenomic_bases = metagenomic_bases / 1000000000) %>%
  summarise(
    total  = sum(metagenomic_bases, na.rm = TRUE),
    mean   = mean(metagenomic_bases, na.rm = TRUE),
    sd     = sd(metagenomic_bases, na.rm = TRUE),
    median = median(metagenomic_bases, na.rm = TRUE),
    IQR    = IQR(metagenomic_bases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_9u43kn5wybo5hf1fde71
total mean sd median IQR
7931.853 3.133881 3.272415 2.964926 3.881528

3.2.5 Assemblies

# Per assembly type: number of distinct assemblies (overall and per sample
# type) and the summed assembly length per sample type.
left_join(read_tsv("data/assembly.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by = "preprocessing_id") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # Keep one row per assembly and sample type before summing lengths: the
  # counts below are de-duplicated with n_distinct(), but sum() would add an
  # assembly's length once for every joined row that carries its assembly_id.
  distinct(assembly_type, assembly_id, sample_type, assembly_length) %>%
  group_by(assembly_type) %>%
  summarise(
    assembly_n  = n_distinct(assembly_id),
    swabs_n     = n_distinct(
      assembly_id[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_n    = n_distinct(
      assembly_id[sample_type == "Faecal"],
      na.rm = TRUE
    ),
    swabs_size  = sum(
      assembly_length[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_size = sum(
      assembly_length[sample_type == "Faecal"],
      na.rm = TRUE
    )
  ) %>%
  tt()
tinytable_iwvn078tr0eqamoi6gkp
assembly_type assembly_n swabs_n faeces_n swabs_size faeces_size
Coassembly 294 41 227 20029149172 916263568901
Individual 1722 177 1377 2337823366 132226412145
Multisplit 1 0 1 0 0
NA 1 0 1 0 0

3.2.6 MAGs

# MAG catalogue summary: number of MAGs (overall and per sample type), number
# of distinct phyla, and mean completeness / contamination.
left_join(read_tsv("data/mag.tsv"),
          read_tsv("data/assembly.tsv"),
          by = "assembly_id") %>%
  left_join(read_tsv("data/preprocessing.tsv"), by = "preprocessing_id") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  dplyr::select(
    mag_id, mag_phylum, mag_completeness, mag_contamination, sample_type
  ) %>%
  distinct() %>%
  summarise(
    # After distinct() there is one row per MAG and sample_type, so a MAG
    # linked to more than one sample_type value occupies several rows; count
    # and average over distinct MAGs rather than over rows.
    number        = n_distinct(mag_id),
    swabs_n       = n_distinct(
      mag_id[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_n      = n_distinct(mag_id[sample_type == "Faecal"], na.rm = TRUE),
    phylums       = n_distinct(mag_phylum),
    completeness  = mean(mag_completeness[!duplicated(mag_id)]),
    contamination = mean(mag_contamination[!duplicated(mag_id)])
  ) %>%
  tt()
tinytable_tweeou9qxra2pv8y9kb0
number swabs_n faeces_n phylums completeness contamination
51690 1896 47757 42 83.52126 2.000393