Load data
# Load data
pseudobulk =
here("data/3_prime_batch_1/preprocessing_results/pseudobulk_preprocessing/pseudobulk_preprocessing_sample_output.rds") |>
readRDS()
Analysis for RNA
Here to check that the input counts have not global sequencing-depth effect
data_for_pca |>
ggplot(aes(count_scaled + 1, color=.sample)) + geom_density() + scale_x_log10()
Calculate PCA of pseudobulk
metadata =
data_for_pca |>
pivot_sample() |>
select(.sample, donor, simple_timepoint, severity, days_since_symptom_onset, COVID, TMM)
metadata = as.data.frame(metadata)
rownames(metadata) = metadata$`.sample`
metadata = metadata[,-1]
my_pca =
data_for_pca@assays@data$count_scaled |>
log1p() |>
scale() |>
pca(metadata = metadata)
Calculate explained variance of principal components. This gives an idea of how many principal components we need to represent the data faithfully. In this case we see a gradual decrease of variance explained, indicating that we might need up to principal component 20 for explaining 90% of the variance.
my_pca |> screeplot()
Here we see which variable is associated with which principal component. We hope the biological variable are associated with the top principal components.
= indeed we see COVID and severity associated with the first principal component, COVID and days since symptom onset associated with the second principal component, and TMM associated with the third principal component together with simple time points. These gifts as hope that the biological effects have large magnitude.
my_pca |>
eigencorplot(metavars = colnames(metadata))
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): donor is not
## numeric - please check the source data as non-numeric variables will be coerced
## to numeric
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): simple_timepoint
## is not numeric - please check the source data as non-numeric variables will be
## coerced to numeric
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): severity is not
## numeric - please check the source data as non-numeric variables will be coerced
## to numeric
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): COVID is not
## numeric - please check the source data as non-numeric variables will be coerced
## to numeric
Here we see for the distribution of samples coloured by severity along the top principal components.
my_pca$rotated |>
cbind(my_pca$metadata) |>
as_tibble(rownames = ".sample") |>
GGally::ggpairs(columns = 2:15, ggplot2::aes(colour=`severity`))
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
Here we see for the distribution of samples coloured by days_since_symptom_onset along the top principal components.
metadata_with_days =
metadata |>
filter(!is.na(days_since_symptom_onset))
data_for_pca_days =
data_for_pca |>
filter(!is.na(days_since_symptom_onset))
my_pca_days =
filter(data_for_pca_days, !is.na(days_since_symptom_onset))@assays@data$count_scaled |>
log1p() |>
scale() |>
pca(metadata = metadata_with_days)
my_pca_days$rotated |>
cbind(my_pca_days$metadata) |>
as_tibble(rownames = ".sample") |>
mutate(days_since_symptom_onset = if_else(days_since_symptom_onset>median(days_since_symptom_onset), "late", "early")) |>
GGally::ggpairs(columns = 2:15, ggplot2::aes(colour=`days_since_symptom_onset`))
my_pca_days |>
eigencorplot(metavars = colnames(metadata_with_days |> select(-COVID)))
## Warning in eigencorplot(my_pca_days, metavars =
## colnames(select(metadata_with_days, : donor is not numeric - please check the
## source data as non-numeric variables will be coerced to numeric
## Warning in eigencorplot(my_pca_days, metavars =
## colnames(select(metadata_with_days, : simple_timepoint is not numeric - please
## check the source data as non-numeric variables will be coerced to numeric
## Warning in eigencorplot(my_pca_days, metavars =
## colnames(select(metadata_with_days, : severity is not numeric - please check the
## source data as non-numeric variables will be coerced to numeric
my_pca_days |>
biplot(colby="TMM")
my_pca_days |>
biplot(colby="donor")
Analysis of antibodies
Here to check that the input counts have not global sequencing-depth effect
data_for_pca |>
ggplot(aes(count_scaled + 1, color=.sample)) + geom_density() + scale_x_log10()
Calculate PCA of pseudobulk
metadata =
data_for_pca |>
pivot_sample() |>
select(.sample, donor, simple_timepoint, severity, days_since_symptom_onset, COVID, TMM) |>
replace_na(replace = list(days_since_symptom_onset = 0))
metadata = as.data.frame(metadata)
rownames(metadata) = metadata$`.sample`
metadata = metadata[,-1]
my_pca =
data_for_pca@assays@data$count_scaled |>
log1p() |>
scale() |>
pca(metadata = metadata)
Calculate explained variance of principal components. This gives an idea of how many principal components we need to represent the data faithfully. In this case we see a gradual decrease of variance explained, indicating that we might need up to printipal component 20 for explaining 90% of the variance.
my_pca |> screeplot()
Here we see which variable is associated with which principal component. We hope the biological variable are associated with the top principal components.
= indeed we see COVID and severity associated with the first principal component, COVID and days since symptom onset associated with the second principal component, and TMM associated with the third principal component together with simple time points. These gifts as hope that the biological effects have large magnitude.
my_pca |>
eigencorplot(metavars = colnames(metadata))
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): donor is not
## numeric - please check the source data as non-numeric variables will be coerced
## to numeric
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): simple_timepoint
## is not numeric - please check the source data as non-numeric variables will be
## coerced to numeric
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): severity is not
## numeric - please check the source data as non-numeric variables will be coerced
## to numeric
## Warning in eigencorplot(my_pca, metavars = colnames(metadata)): COVID is not
## numeric - please check the source data as non-numeric variables will be coerced
## to numeric
Here we see for the distribution of samples coloused by severity along the top principal components.
my_pca$rotated |>
cbind(my_pca$metadata) |>
as_tibble(rownames = ".sample") |>
GGally::ggpairs(columns = 2:15, ggplot2::aes(colour=`severity`))
Here we see for the distribution of samples coloused by days_since_symptom_onset along the top principal components.
my_pca$rotated |>
cbind(my_pca$metadata) |>
as_tibble(rownames = ".sample") |>
mutate(days_since_symptom_onset = if_else(days_since_symptom_onset>median(days_since_symptom_onset), "late", "early")) |>
GGally::ggpairs(columns = 2:15, ggplot2::aes(colour=`days_since_symptom_onset`))
my_pca |>
biplot(colby="TMM")
my_pca |>
biplot(colby="donor")
Analysis of covariate redundancy
metadata
## donor simple_timepoint severity days_since_symptom_onset
## 0483-002_NA 0483-002 NA moderate 0
## 0483-002_T1 0483-002 T1 moderate 5
## 0483-002_T2 0483-002 T2 moderate 8
## 0483-002_T3 0483-002 T3 moderate 34
## 0483-003_NA 0483-003 NA moderate 0
## 0483-003_T2 0483-003 T2 moderate 7
## 0483-003_T3 0483-003 T3 moderate 39
## 1637-W-003_NA 1637-W-003 NA severe 0
## 1637-W-003_T2 1637-W-003 T2 severe 11
## 1637-W-003_T3 1637-W-003 T3 severe 17
## 482-W-032_NA 482-W-032 NA severe 0
## 482-W-032_T1 482-W-032 T1 severe 33
## 482-W-032_T2 482-W-032 T2 severe 37
## 482-W-032_T3 482-W-032 T3 severe 39
## 482-W-034_NA 482-W-034 NA severe 0
## 482-W-034_T1 482-W-034 T1 severe 9
## 482-W-034_T2 482-W-034 T2 severe 11
## 482-W-034_T3 482-W-034 T3 severe 15
## 482-W-035_NA 482-W-035 NA moderate 0
## 482-W-035_T1 482-W-035 T1 moderate 3
## 482-W-035_T2 482-W-035 T2 moderate 5
## 482-W-035_T3 482-W-035 T3 moderate 9
## VBDR1013_NA VBDR1013 NA NA 0
## VBDR1013_none VBDR1013 none NA 0
## VBDR1140_NA VBDR1140 NA NA 0
## VBDR1140_none VBDR1140 none NA 0
## VBDR563_NA VBDR563 NA NA 0
## VBDR563_none VBDR563 none NA 0
## VBDR813_NA VBDR813 NA NA 0
## VBDR813_none VBDR813 none NA 0
## VBDR821_NA VBDR821 NA NA 0
## VBDR821_none VBDR821 none NA 0
## VBDR997_NA VBDR997 NA NA 0
## VBDR997_none VBDR997 none NA 0
## COVID TMM
## 0483-002_NA TRUE 1.0074424
## 0483-002_T1 TRUE 1.0255025
## 0483-002_T2 TRUE 1.0156813
## 0483-002_T3 TRUE 1.0106802
## 0483-003_NA TRUE 0.7986725
## 0483-003_T2 TRUE 1.0774194
## 0483-003_T3 TRUE 0.8916291
## 1637-W-003_NA TRUE 0.7399618
## 1637-W-003_T2 TRUE 0.8664717
## 1637-W-003_T3 TRUE 1.0360248
## 482-W-032_NA TRUE 0.9160799
## 482-W-032_T1 TRUE 1.0076738
## 482-W-032_T2 TRUE 1.0242726
## 482-W-032_T3 TRUE 1.0082499
## 482-W-034_NA TRUE 0.8628639
## 482-W-034_T1 TRUE 0.9733934
## 482-W-034_T2 TRUE 1.0188914
## 482-W-034_T3 TRUE 1.0121775
## 482-W-035_NA TRUE 0.9916354
## 482-W-035_T1 TRUE 1.0499111
## 482-W-035_T2 TRUE 1.1296713
## 482-W-035_T3 TRUE 1.0691957
## VBDR1013_NA FALSE 1.1674736
## VBDR1013_none FALSE 1.1313864
## VBDR1140_NA FALSE 1.1174852
## VBDR1140_none FALSE 1.0329846
## VBDR563_NA FALSE 1.0694319
## VBDR563_none FALSE 1.0488852
## VBDR813_NA FALSE 1.0065996
## VBDR813_none FALSE 1.1475235
## VBDR821_NA FALSE 0.9507530
## VBDR821_none FALSE 1.0278402
## VBDR997_NA FALSE 0.8730673
## VBDR997_none FALSE 1.0570607
require(tidyverse)
require(rcompanion)
## Loading required package: rcompanion
require(corrr)
## Loading required package: corrr
##
## Attaching package: 'corrr'
## The following object is masked from 'package:tidybulk':
##
## as_matrix
# Calculate a pairwise association between all variables in a data-frame. In particular nominal vs nominal with Chi-square, numeric vs numeric with Pearson correlation, and nominal vs numeric with ANOVA.
# Adopted from https://stackoverflow.com/a/52557631/590437
mixed_assoc = function(df, cor_method="spearman", adjust_cramersv_bias=TRUE){
# https://stackoverflow.com/questions/52554336/plot-the-equivalent-of-correlation-matrix-for-factors-categorical-data-and-mi
df_comb = expand.grid(names(df), names(df), stringsAsFactors = F) %>% set_names(c("X1", "X2"))
is_nominal = function(x) class(x) %in% c("factor", "character")
# https://community.rstudio.com/t/why-is-purr-is-numeric-deprecated/3559
# https://github.com/r-lib/rlang/issues/781
is_numeric <- function(x) { is.integer(x) || is_double(x)}
f = function(xName,yName) {
x = pull(df, xName)
y = pull(df, yName)
result = if(is_nominal(x) && is_nominal(y)){
# use bias corrected cramersV as described in https://rdrr.io/cran/rcompanion/man/cramerV.html
cv = cramerV(as.character(x), as.character(y), bias.correct = adjust_cramersv_bias)
data.frame(xName, yName, assoc=cv, type="cramersV")
}else if(is_numeric(x) && is_numeric(y)){
correlation = cor(x, y, method=cor_method, use="complete.obs")
data.frame(xName, yName, assoc=correlation, type="correlation")
}else if(is_numeric(x) && is_nominal(y)){
# from https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable/124618#124618
r_squared = summary(lm(x ~ y))$r.squared
data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")
}else if(is_nominal(x) && is_numeric(y)){
r_squared = summary(lm(y ~x))$r.squared
data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")
}else {
warning(paste("unmatched column type combination: ", class(x), class(y)))
return(NULL)
}
# finally add complete obs number and ratio to table
result %>% mutate(complete_obs_pairs=sum(!is.na(x) & !is.na(y)), complete_obs_ratio=complete_obs_pairs/length(x)) %>% rename(x=xName, y=yName)
}
# apply function to each variable combination
map2_df(df_comb$X1, df_comb$X2, f)
}
mixed_assoc(metadata)
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## factor
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## factor
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## factor
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## numeric
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: factor
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: factor
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: factor
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: numeric
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: numeric
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## numeric
## x y assoc
## Cramer V...1 donor donor 1.00000000
## Cramer V...2 simple_timepoint donor 0.00000000
## Cramer V...3 severity donor 0.84240000
## ...4 days_since_symptom_onset donor 0.68386487
## ...5 TMM donor 0.71400293
## Cramer V...6 donor simple_timepoint 0.00000000
## Cramer V...7 simple_timepoint simple_timepoint 1.00000000
## Cramer V...8 severity simple_timepoint 0.44450000
## ...9 days_since_symptom_onset simple_timepoint 0.77262321
## ...10 TMM simple_timepoint 0.42901040
## Cramer V...11 donor severity 0.84240000
## Cramer V...12 simple_timepoint severity 0.44450000
## Cramer V...13 severity severity 1.00000000
## ...14 days_since_symptom_onset severity 0.51794104
## ...15 TMM severity 0.43571360
## ...16 donor days_since_symptom_onset 0.68386487
## ...17 simple_timepoint days_since_symptom_onset 0.77262321
## ...18 severity days_since_symptom_onset 0.51794104
## ...19 days_since_symptom_onset days_since_symptom_onset 1.00000000
## ...20 TMM days_since_symptom_onset -0.07103901
## ...21 donor TMM 0.71400293
## ...22 simple_timepoint TMM 0.42901040
## ...23 severity TMM 0.43571360
## ...24 days_since_symptom_onset TMM -0.07103901
## ...25 TMM TMM 1.00000000
## type complete_obs_pairs complete_obs_ratio
## Cramer V...1 cramersV 34 1
## Cramer V...2 cramersV 34 1
## Cramer V...3 cramersV 34 1
## ...4 anova 34 1
## ...5 anova 34 1
## Cramer V...6 cramersV 34 1
## Cramer V...7 cramersV 34 1
## Cramer V...8 cramersV 34 1
## ...9 anova 34 1
## ...10 anova 34 1
## Cramer V...11 cramersV 34 1
## Cramer V...12 cramersV 34 1
## Cramer V...13 cramersV 34 1
## ...14 anova 34 1
## ...15 anova 34 1
## ...16 anova 34 1
## ...17 anova 34 1
## ...18 anova 34 1
## ...19 correlation 34 1
## ...20 correlation 34 1
## ...21 anova 34 1
## ...22 anova 34 1
## ...23 anova 34 1
## ...24 correlation 34 1
## ...25 correlation 34 1
Heatmap of correlation
metadata %>%
mixed_assoc() %>%
mutate(assoc = round(assoc, 3)) |>
select(x, y, assoc) %>%
ggplot(aes(x,y,fill=assoc))+
geom_tile()+
geom_text(aes(x,y,label=assoc))+
scale_fill_gradient2() +
theme_classic()
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## factor
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## factor
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## factor
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## numeric
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: factor
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: factor
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: factor
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: numeric
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: numeric
## logical
## Warning in .f(.x[[i]], .y[[i]], ...): unmatched column type combination: logical
## numeric
Pair plot, all vs all.
GGally::ggpairs(metadata, aes(color = severity)) + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.