diff --git a/06-analyze/dilution-load.png b/06-analyze/dilution-load.png index fb2cb74..22317cf 100644 Binary files a/06-analyze/dilution-load.png and b/06-analyze/dilution-load.png differ diff --git a/06-analyze/logs/base-residuals.png b/06-analyze/logs/base-residuals.png index f155892..b31ac81 100644 Binary files a/06-analyze/logs/base-residuals.png and b/06-analyze/logs/base-residuals.png differ diff --git a/06-analyze/logs/full-residuals.png b/06-analyze/logs/full-residuals.png index ba25d62..83c3cc8 100644 Binary files a/06-analyze/logs/full-residuals.png and b/06-analyze/logs/full-residuals.png differ diff --git a/06-analyze/logs/fun-full-rand.png b/06-analyze/logs/fun-full-rand.png index 7baefc7..6158b45 100644 Binary files a/06-analyze/logs/fun-full-rand.png and b/06-analyze/logs/fun-full-rand.png differ diff --git a/06-analyze/logs/gi-full-rand.png b/06-analyze/logs/gi-full-rand.png index 0f5e4bf..f6325fd 100644 Binary files a/06-analyze/logs/gi-full-rand.png and b/06-analyze/logs/gi-full-rand.png differ diff --git a/06-analyze/residuals.png b/06-analyze/residuals.png index 7430ed5..993703b 100644 Binary files a/06-analyze/residuals.png and b/06-analyze/residuals.png differ diff --git a/README.md b/README.md index c80b179..bdd2d7a 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,18 @@ Kyle A. Gervers > This work is unpublished. All results shown are preliminary. -## What’s hamPCR? +# Sections + +- [hamPCR](#hampcr) +- [Overview](#overview) +- [Reproducibility](#reproducibility) +- [Installation](#installation) +- [Methods](#methods) +- [Results](#results) + +## hamPCR + +[Return](#sections) Host-associated microbe PCR (hamPCR) is a metabarcoding technique developed by [Derek Lundberg *et @@ -49,7 +60,9 @@ productivity-diversity relationships. hamPCR, similar to other microbial load while at the same time censusing the microbial community of interest. -## What is this project? +## Overview + +[Return](#sections) Following the protocol described by Lundberg *et al.*, a primer pair was designed (Gi) that amplfies a portion of *Gigantea* coding sequence in @@ -79,35 +92,39 @@ chemistry. ## Reproducibility +[Return](#sections) + All packages were installed and managed with `conda`. - ## conda 22.11.1 - ## name: base - ## channels: - ## - conda-forge - ## - bioconda - ## - defaults - ## dependencies: - ## - fastqc=0.11.9 - ## - r-base=4.2.2 - ## - multiqc=1.13 - ## - gh=2.21.2 - ## - pheniqs=2.1.0 - ## - libgit2=1.5.0 - ## - git=2.39.0 - ## - itsx=1.1.3 - ## - cutadapt=4.2 - ## - vsearch=2.22.1 - ## - bioconductor-dada2=1.26.0 - ## - r-remotes=2.4.2 - ## - r-dplyr=1.0.10 - ## - atropos=1.1.31 - ## - r-rmarkdown=2.20 - ## - r-markdown=1.4 - ## prefix: /home/gerverska/projects/fun-gi/env + conda 22.11.1 + name: base + channels: + - conda-forge + - bioconda + - defaults + dependencies: + - fastqc=0.11.9 + - r-base=4.2.2 + - multiqc=1.13 + - gh=2.21.2 + - pheniqs=2.1.0 + - libgit2=1.5.0 + - git=2.39.0 + - itsx=1.1.3 + - cutadapt=4.2 + - vsearch=2.22.1 + - bioconductor-dada2=1.26.0 + - r-remotes=2.4.2 + - r-dplyr=1.0.10 + - atropos=1.1.31 + - r-rmarkdown=2.20 + - r-markdown=1.4 + prefix: /home/gerverska/projects/fun-gi/env ## Installation +[Return](#sections) + Install the above bioinformatic environment from `config.yml` using the script `00-build.sh` @@ -119,42 +136,163 @@ script `00-build.sh` A `make` implementation is on the horizon… -## Figures +## Methods + +[Return](#sections) + +- [Demultiplex](#demultiplex) +- [Trim](#trim) +- [Denoise](#denoise) +- [Compile](#compile) +- [Rarefy](#rarefy) +- [Analyze](#analyze) + +Descriptions are ongoing. + +### Demultiplex + +### Trim + +### Denoise + +[Methods](#methods) + +![fun-error-fwd](03-denoise/logs/fun-error-fwd.png) + +![fun-error-rev](03-denoise/logs/fun-error-rev.png) + +![gi-error-fwd](03-denoise/logs/gi-error-fwd.png) + +![gi-error-rev](03-denoise/logs/gi-error-rev.png) + +### Compile + +[Methods](#methods) + +#### LULU post-clustering of Fun ASVs + -Descriptions are coming soon. + ####processing: fun_OTU.1 ##### + ---hits: fun_OTU.3 + ---hits: fun_OTU.4 + ---hits: fun_OTU.5 + ---potential parent: + No parent found! -### DADA2 error modeling + ####processing: fun_OTU.3 ##### + ---hits: fun_OTU.1 + ---hits: fun_OTU.4 + ---hits: fun_OTU.5 + ---potential parent: fun_OTU.1 + ------checking: fun_OTU.1 + ------relative cooccurence: 1 which is sufficient! + ------min avg abundance: 32.8 which is OK! + SETTING fun_OTU.3 to be an ERROR of fun_OTU.1 - - + ####processing: fun_OTU.2 ##### + ---hits: + ---potential parent: + No parent found! - - + ####processing: fun_OTU.4 ##### + ---hits: fun_OTU.1 + ---hits: fun_OTU.3 + ---potential parent: fun_OTU.1 + ---potential parent: fun_OTU.3 + ------checking: fun_OTU.1 + ------relative cooccurence: 1 which is sufficient! + ------min avg abundance: 103.945945945946 which is OK! + SETTING fun_OTU.4 to be an ERROR of fun_OTU.1 -### Sequencing depth along the NOGA:PSME DNA standard curve + ------checking: fun_OTU.3 + ####processing: fun_OTU.5 ##### + ---hits: fun_OTU.1 + ---hits: fun_OTU.3 + ---potential parent: fun_OTU.1 + ---potential parent: fun_OTU.3 + ------checking: fun_OTU.1 + ------relative cooccurence: 0 + ------checking: fun_OTU.3 + ------relative cooccurence: 0 + No parent found! + +#### LULU post-clustering of Gi ASVs + + + ####processing: gi_OTU.1 ##### + ---hits: gi_OTU.2 + ---hits: gi_OTU.3 + ---potential parent: + No parent found! + + ####processing: gi_OTU.2 ##### + ---hits: gi_OTU.1 + ---hits: gi_OTU.3 + ---potential parent: gi_OTU.1 + ------checking: gi_OTU.1 + ------relative cooccurence: 1 which is sufficient! + ------min avg abundance: 2.20670391061453 which is OK! + SETTING gi_OTU.2 to be an ERROR of gi_OTU.1 + + ####processing: gi_OTU.3 ##### + ---hits: gi_OTU.1 + ---hits: gi_OTU.2 + ---potential parent: gi_OTU.1 + ---potential parent: gi_OTU.2 + ------checking: gi_OTU.1 + ------relative cooccurence: 1 which is sufficient! + ------min avg abundance: 10.8850574712644 which is OK! + SETTING gi_OTU.3 to be an ERROR of gi_OTU.1 + + ------checking: gi_OTU.2 + +### Rarefy + +[Methods](#methods) ![dilution-bias](05-rarefy/logs/dilution-bias.png) -### Checking assumptions when modeling NOGA load against NOGA DNA dilution +### Analyze + +[Methods](#methods) + +#### Base model + +##### Full model residuals -#### Base model residuals + Regress NOGA load onto known NOGA dilution, without accounting for Fun + Gi frameshifts + + base <- lm(log10(mean_noga_load) ~ log10(dilution), standard) ![base-residuals](06-analyze/logs/base-residuals.png) -#### Full model residuals (a) and Fun (b) + Gi (c) frameshift random intercepts +#### Full model + + Regress NOGA load onto known NOGA dilution, treating Fun + Gi frameshifts as random effects + + library(nlme) + + full <- lme(log10(mean_noga_load) ~ log10(dilution), + random = list(fun_n = ~ 1, gi_n = ~ 1), + data = standard, + method = 'ML', + na.action = na.omit) + +##### Full model residuals + +![full-residuals](06-analyze/logs/full-residuals.png) + +##### Fun frameshift random intercepts + +![fun-full-rand](06-analyze/logs/fun-full-rand.png) + +##### Gi frameshift random intercepts (nested within Fun) + +![gi-full-rand](06-analyze/logs/gi-full-rand.png) -a b - c - +## Results -### NOGA load modeled against NOGA DNA dilution +### NOGA load regressed on NOGA dilution ![dilution-load](06-analyze/dilution-load.png) diff --git a/README.rmd b/README.rmd index a814b9f..87b6a7b 100644 --- a/README.rmd +++ b/README.rmd @@ -11,25 +11,41 @@ knitr::opts_chunk$set(echo = F, include = F) > This work is unpublished. All results shown are preliminary. -## What's hamPCR? +# Sections +* [hamPCR](#hampcr) +* [Overview](#overview) +* [Reproducibility](#reproducibility) +* [Installation](#installation) +* [Methods](#methods) +* [Results](#results) + +## hamPCR +[Return](#sections) + Host-associated microbe PCR (hamPCR) is a metabarcoding technique developed by [Derek Lundberg *et al.*](https://elifesciences.org/articles/66186) for quantifying microbial load in or on host tissues. Using a pair of host-specific primers to produce amplicons of a slightly different size (80-120 bp) relative to the target microbial amplicon, investigators can safely create a host reference internal to each sample that doesn't swamp the more important microbial signal. After sequencing, dividing the count of microbial reads in each sample by the count of host reads creates and index of microbial load that's comparable across samples. This microbial load index can also be examined for every microbial taxonomic unit, allowing investigators to examine which taxa have loads that are positively or negatively correlated with each other and which samples have more or less compositionally similar loads. Notably, this is an improvement compared to methods which use the relative abundances of taxonomic units to describe communities. Because relative abundances are always constrained to sum to one, a *real increase or decrease* in the abundance of any taxonomic feature necessitates an *artifactual decrease or increase* in the relative abundance of all other features (and no, the order of "increase" and "decrease" was not swapped here). This means that relative abundance-based analyses are unable to distinguish between scenarios in which one taxonomic feature increases in actual abundance or in which all other taxonomic features decrease. While this downside to relative abundance analyses does not represent an inherent shortcoming if microbial ecologists adjust their interpretations appropriately, this adjustment is likely more of a compromise than is currently appreciated. For example, most ecologists are interested in knowing how a taxon responds to a meaningful environmental gradient or treatment. If the relative abundance of a taxon generally increases as the gradient increases in intensity, it's unclear whether this result represents an outcome in which the taxon increasingly *prefers* a condition along a gradient or whether *other taxa simply prefer it less*. Further, unless microbial ecologists employ other means of estimating microbial load (qPCR, CFUs, cell-sorting, etc.), they generally cannot investigate fundamental phenomena centered around productivity, e.g., productivity-diversity relationships. hamPCR, similar to other "spike-in" approaches, represents a more achievable way to estimate microbial load while at the same time censusing the microbial community of interest. -## What is this project? +## Overview +[Return](#sections) + Following the protocol described by Lundberg *et al.*, a primer pair was designed (Gi) that amplfies a portion of *Gigantea* coding sequence in coastal Douglas-fir (*Pseudotsuga menziesii* var. *menziesii*, PSME). This amplicon is ~100 bp larger than most amplicons produced with the fungal-specific primers 5.8S-Fun and ITS4-Fun (Fun), designed by [D. Lee Taylor *et al.* 2016](https://journals.asm.org/doi/10.1128/AEM.02576-16). Using this size difference, the primer concentration of Gi primers was optimized. To test whether or not fungal load calculations derived from this technique were reliable, PCRs were performed along a standard curve of *Nothophaeocryptopus gaeumannii* (NOGA) DNA serially diluted in PSME DNA. Additionally, 8 samples of pooled needle DNA extractions (DFSSMT samples) were included to predict where an average sample might fall along the standard curve. Further, with the expectation of a future need to extend the number of barcode indices for multiplexing samples prior to sequencing, the 3-9 bp frameshift regions in each primer were incorporated into the multiplexing strategy. To account for the limited number of samples included on this run, frameshift combinations were randomly assigned to samples. Knowing that high indel rates could erode the fidelity of this multiplexing scheme, all pairwise combinations of forward and reverse frameshifts were used in demultiplexing. All samples were sequenced on an Illumina MiSeq using a 500-cycle (2x250 bp) Nano reagent kit with v2 chemistry. ## Reproducibility +[Return](#sections) + All packages were installed and managed with `conda`. -```{bash conda, include=TRUE, echo=FALSE} +```{bash conda, include=TRUE, echo=FALSE, comment=""} conda --version cat config.yml ``` ## Installation +[Return](#sections) + Install the above bioinformatic environment from `config.yml` using the script `00-build.sh` ``` # First clone the repository (using the gh CLI tool here) #### @@ -40,31 +56,101 @@ bash code/00-build.sh ``` A `make` implementation is on the horizon... -## Figures +## Methods +[Return](#sections) + +- [Demultiplex](#demultiplex) +- [Trim](#trim) +- [Denoise](#denoise) +- [Compile](#compile) +- [Rarefy](#rarefy) +- [Analyze](#analyze) + +Descriptions are ongoing. + +### Demultiplex + +### Trim + +### Denoise +[Methods](#methods) -Descriptions are coming soon. +![fun-error-fwd](03-denoise/logs/fun-error-fwd.png) -### DADA2 error modeling +![fun-error-rev](03-denoise/logs/fun-error-rev.png) -![fun-error-fwd](03-denoise/logs/fun-error-fwd.png){width=45%} ![fun-error-rev](03-denoise/logs/fun-error-rev.png){width=45%} +![gi-error-fwd](03-denoise/logs/gi-error-fwd.png) -![gi-error-fwd](03-denoise/logs/gi-error-fwd.png){width=45%} ![gi-error-rev](03-denoise/logs/gi-error-rev.png){width=45%} +![gi-error-rev](03-denoise/logs/gi-error-rev.png) -### Sequencing depth along the NOGA:PSME DNA standard curve +### Compile +[Methods](#methods) + +#### LULU post-clustering of Fun ASVs + +```{bash fun-compile, include=TRUE, echo=FALSE, comment=""} + +cat 04-compile/logs/fun-lulu.txt + +``` + +#### LULU post-clustering of Gi ASVs + +```{bash gi-compile, include=TRUE, echo=FALSE, comment=""} + +cat 04-compile/logs/gi-lulu.txt + +``` + +### Rarefy +[Methods](#methods) ![dilution-bias](05-rarefy/logs/dilution-bias.png) -### Checking assumptions when modeling NOGA load against NOGA DNA dilution +### Analyze +[Methods](#methods) + +#### Base model -#### Base model residuals +##### Full model residuals + +``` +Regress NOGA load onto known NOGA dilution, without accounting for Fun + Gi frameshifts + +base <- lm(log10(mean_noga_load) ~ log10(dilution), standard) +``` ![base-residuals](06-analyze/logs/base-residuals.png) -#### Full model residuals (a) and Fun (b) + Gi (c) frameshift random intercepts +#### Full model + +``` +Regress NOGA load onto known NOGA dilution, treating Fun + Gi frameshifts as random effects + +library(nlme) + +full <- lme(log10(mean_noga_load) ~ log10(dilution), + random = list(fun_n = ~ 1, gi_n = ~ 1), + data = standard, + method = 'ML', + na.action = na.omit) +``` + +##### Full model residuals + +![full-residuals](06-analyze/logs/full-residuals.png) + +##### Fun frameshift random intercepts + +![fun-full-rand](06-analyze/logs/fun-full-rand.png) + +##### Gi frameshift random intercepts (nested within Fun) + +![gi-full-rand](06-analyze/logs/gi-full-rand.png) -a ![full-residuals](06-analyze/logs/full-residuals.png){width=30%} b ![fun-full-rand](06-analyze/logs/fun-full-rand.png){width=30%} c ![gi-full-rand](06-analyze/logs/gi-full-rand.png){width=30%} +## Results -### NOGA load modeled against NOGA DNA dilution +### NOGA load regressed on NOGA dilution ![dilution-load](06-analyze/dilution-load.png) diff --git a/code/06-analyze.r b/code/06-analyze.r index cfb8f16..c813fce 100644 --- a/code/06-analyze.r +++ b/code/06-analyze.r @@ -130,10 +130,44 @@ shifts <- ggplot(standard, aes(x = fun_n, y = gi_n, color = res)) + file.path(out, 'residuals.png') |> ggsave(shifts, width = 9, height = 6) # Comparing relative abundance and load metrics #### -fun.tab <- standard[, colnames(standard)[grepl('OTU', colnames(standard)) == T]] -fun.tab <- fun.tab[, colSums(fun.tab) > 0] +fun.tab <- subset(meta, template == 'dfssmt', + select = colnames(meta)[grepl('fun_OTU', colnames(meta)) == T]) +gi.reads <- subset(meta, template == 'dfssmt', + select = colnames(meta)[grepl('gi_OTU', colnames(meta)) == T]) |> rowSums() +combo <- meta[meta$template == 'dfssmt' , 'combo'] + +fun.tab <- subset(fun.tab, select = c((colSums(fun.tab)) > 0)) +fun.otus <- colnames(fun.tab) + +fun.ra <- fun.tab / rowSums(fun.tab) +fun.ra$stat <- 'Relative abundance' +fun.load <- (fun.tab / gi.reads)^(1/4) +fun.load$stat <- 'Log-transformed load' +fun.both <- rbind(fun.ra, fun.load) +fun.both$combo <- rownames(fun.ra) |> rep(2) + +long <- reshape(fun.both, + direction = 'long', varying = fun.otus, + v.names = 'value', times = fun.otus, + timevar = 'OTU', idvar = c('combo', 'stat')) +long$OTU <- long$OTU |> gsub('fun_', '', x = _) + +compare <- ggplot(long, aes(x = combo, y = value, fill = OTU)) + + geom_bar(stat = 'identity') + + facet_grid(rows = vars(stat), scales = 'free') + + scale_y_continuous(n.breaks = 6) + + xlab("\nDFSSMT sample") + + ylab("Value\n") + + theme_classic() + + theme(text = element_text(size = 14), + axis.title.y = element_blank(), + axis.text.x = element_blank(), + axis.ticks.x = element_blank()) +theme(text = element_text(size = 14), + axis.text.x = element_text(angle = 45, vjust = 0.5), + axis.title.x = element_text(face = 'bold'), + axis.title.y = element_text(face = 'bold'), + legend.title = element_text(face = 'bold')) -tax <- fun.gi$tax[rownames(fun.gi$tax) %in% colnames(fun.tab), ] |> t() |> data.frame() +test$value |> quantile() -fun.tab <- standard |> subset(select = colnames(standard)[grepl('OTU', colnames(standard)) == T]) -fun.tab <- standard |> subset(select = colnames(standard)[grepl('OTU', colnames(standard)) == T])