How to prepare a custom training set for DECIPHER::IdTaxa?
These are from the DECIPHER package.

Library

library(DECIPHER)

Read the fasta file with 16S rRNA gene sequences for taxa in mock community.

# Here example is store in the package.
db <- system.file("extdata", "ZymoDb.fasta",
                      package="chkMocks", mustWork = TRUE)
#path for file
db

Convert to DNAStringSet.

seqs <- Biostrings::readDNAStringSet(db)
seqs <- OrientNucleotides(seqs)
# check first 2 as example
names(seqs)[1:2]

Now, adding the dummy seq name before > and adding a ‘Root’ before Bacteria;Phylum;etc;

# here, adding the dummy seq name before > and adding a 'Root' before Bacteria;Phylum;etc;
names(seqs) <- paste0("ZymoSeq", seq(1:length(names(seqs))), " ", "Root;" ,names(seqs))

# check first 2 as example
names(seqs)[1:2]

We show how to check for problematic taxonomies. However, it will most likely be not required if input fasta is formatted correctly.

groups <- names(seqs) # sequence names
groups <- gsub("(.*)(Root;)", "\\2", groups)
groupCounts <- table(groups)
u_groups <- names(groupCounts)
length(u_groups)
maxGroupSize <- 10 # max sequences per label (>= 1)
remove <- logical(length(seqs))

Create a training set.

maxIterations <- 3
allowGroupRemoval <- FALSE
probSeqsPrev <- integer()

for (i in which(groupCounts > maxGroupSize)) {
  index <- which(groups==u_groups[i])
  keep <- sample(length(index),
                 maxGroupSize)
  remove[index[-keep]] <- TRUE
}
sum(remove)

taxid <- NULL
for (i in seq_len(maxIterations)) {
  cat("Training iteration: ", i, "\n", sep="")
  # train the classifier
  ZymoTrainingSet <- LearnTaxa(seqs[!remove],
                           names(seqs)[!remove],
                           taxid)
  # look for problem sequences
  probSeqs <- ZymoTrainingSet$problemSequences$Index
  if (length(probSeqs)==0) {
    cat("No problem sequences remaining.\n")
    break
  } else if (length(probSeqs)==length(probSeqsPrev) &&
             all(probSeqsPrev==probSeqs)) {
    cat("Iterations converged.\n")
    break
  }
  if (i==maxIterations)
    break
  probSeqsPrev <- probSeqs
  # remove any problem sequences
  index <- which(!remove)[probSeqs]
  remove[index] <- TRUE # remove all problem sequences
  if (!allowGroupRemoval) {
    # replace any removed groups
    missing <- !(u_groups %in% groups[!remove])
    missing <- u_groups[missing]
    if (length(missing) > 0) {
      index <- index[groups[index] %in% missing]
      remove[index] <- FALSE # don't remove
    }
  }
}

Check any problems. We do not have it in this example.

sum(remove)
length(probSeqs)

Check training set.

ZymoTrainingSet
# for internal
#usethis::use_data(ZymoTrainingSet, overwrite = TRUE, compress = "xz")
 plot(ZymoTrainingSet)
devtools::session_info()