library(DESeq2)
library(GEOquery)
library(SummarizedExperiment)
library(dplyr)2 METABRIC
2.1 Packages to use
First load the packages that will be used along the cleaning process. Always do this at the beginning of your script to make things organized.
2.2 Introduction
To download the clinical data and expression levels from the METABRIC cohort go to cbioportal and select the respective cohort (METABRIC): https://www.cbioportal.org/
Here we will load and format the metabric data in the same way as SCAN-B and the AI dataset, so it is standardized and better to use in future analysis. For this, we use the summarized experiment object to store expression data and clinical information.
2.3 Downloading data
Loading gene expression levels.
expression_data <- read.csv(
"data_mrna_agilent_microarray.txt",
sep = "\t",
check.names = FALSE
) %>% dplyr::mutate(Entrez_Gene_Id = NULL) %>%
`colnames<-`(stringr::str_replace_all(colnames(.), stringr::fixed("."), "-"))
dim(expression_data)We note that the first two columns correspond to HUGO symbol and ENTREZ ID.
Load now clinical data.
clinical_data <- read.csv(
"data_clinical_patient.txt",
sep = "\t",
comment.char = "#"
)
glimpse(clinical_data)2.4 Cleaning and integrating
We see that there are over 2500 rows, meaning that we have more patients with clinical data than expression levels. Let us now select patients that have expression levels.
length(unique(clinical_data$PATIENT_ID)) == nrow(clinical_data)We see that each row has a unique identifier, the patient ID, so we set this as a rowname.
rownames(clinical_data) <- clinical_data$PATIENT_IDAnd we check if all patients from expression data have clinical data.
length(
intersect(
clinical_data$PATIENT_ID,
colnames(expression_data[, -c(1,2)])
)
) == ncol(expression_data)-2Indeed it has, therefore we can just subselect clinical data.
duplicated_genes <- expression_data$Hugo_Symbol[
duplicated(expression_data$Hugo_Symbol)
]
hugo_symbols_duplicated <- expression_data$Hugo_Symbol[
duplicated(expression_data$Hugo_Symbol)
] %>% unique
median_genes <- sapply(
hugo_symbols_duplicated,
function(gene, df){
df %>% dplyr::filter(Hugo_Symbol == gene) %>%
dplyr::mutate(Hugo_Symbol = NULL) %>%
as.matrix(.) %>%
MatrixGenerics::colMedians(.)
},
df = expression_data
)
metabric_exp <- expression_data[!duplicated(expression_data$Hugo_Symbol), ]
rownames(metabric_exp) <- expression_data$Hugo_Symbol[
!duplicated(expression_data$Hugo_Symbol)
]
metabric_exp$Hugo_Symbol <- NULL
metabric_exp[colnames(median_genes), ] <- median_genes %>% tclinical_data <- clinical_data[
intersect(colnames(metabric_exp), rownames(clinical_data)),
]Before we just average the duplicated genes median intensity.
# add new clinical data to the summarized experiment object.
metabric <- SummarizedExperiment::SummarizedExperiment(
assays = list(
median_intensity = metabric_exp[, rownames(clinical_data)]
),
colData = clinical_data
)And we can finally save the RDS file to load it up faster later.
# save the final dataset
saveRDS(
metabric,
file = file.path("metabric_filtered.rds")
)