library(DESeq2)
library(GEOquery)
library(SummarizedExperiment)
library(dplyr)
2 METABRIC
2.1 Packages to use
First load the packages that will be used along the cleaning process. Always do this at the beginning of your script to make things organized.
2.2 Introduction
To download the clinical data and expression levels from the METABRIC cohort go to cbioportal and select the respective cohort (METABRIC): https://www.cbioportal.org/
Here we will load and format the metabric data in the same way as SCAN-B and the AI dataset, so it is standardized and better to use in future analysis. For this, we use the summarized experiment object to store expression data and clinical information.
2.3 Downloading data
Loading gene expression levels.
<- read.csv(
expression_data "data_mrna_agilent_microarray.txt",
sep = "\t",
check.names = FALSE
%>% dplyr::mutate(Entrez_Gene_Id = NULL) %>%
) `colnames<-`(stringr::str_replace_all(colnames(.), stringr::fixed("."), "-"))
dim(expression_data)
We note that the first two columns correspond to HUGO symbol and ENTREZ ID.
Load now clinical data.
<- read.csv(
clinical_data "data_clinical_patient.txt",
sep = "\t",
comment.char = "#"
)
glimpse(clinical_data)
2.4 Cleaning and integrating
We see that there are over 2500 rows, meaning that we have more patients with clinical data than expression levels. Let us now select patients that have expression levels.
length(unique(clinical_data$PATIENT_ID)) == nrow(clinical_data)
We see that each row has a unique identifier, the patient ID, so we set this as a rowname.
rownames(clinical_data) <- clinical_data$PATIENT_ID
And we check if all patients from expression data have clinical data.
length(
intersect(
$PATIENT_ID,
clinical_datacolnames(expression_data[, -c(1,2)])
)== ncol(expression_data)-2 )
Indeed it has, therefore we can just subselect clinical data.
<- expression_data$Hugo_Symbol[
duplicated_genes duplicated(expression_data$Hugo_Symbol)
]
<- expression_data$Hugo_Symbol[
hugo_symbols_duplicated duplicated(expression_data$Hugo_Symbol)
%>% unique
]
<- sapply(
median_genes
hugo_symbols_duplicated,function(gene, df){
%>% dplyr::filter(Hugo_Symbol == gene) %>%
df ::mutate(Hugo_Symbol = NULL) %>%
dplyras.matrix(.) %>%
::colMedians(.)
MatrixGenerics
},df = expression_data
)
<- expression_data[!duplicated(expression_data$Hugo_Symbol), ]
metabric_exp rownames(metabric_exp) <- expression_data$Hugo_Symbol[
!duplicated(expression_data$Hugo_Symbol)
]$Hugo_Symbol <- NULL
metabric_expcolnames(median_genes), ] <- median_genes %>% t metabric_exp[
<- clinical_data[
clinical_data intersect(colnames(metabric_exp), rownames(clinical_data)),
]
Before we just average the duplicated genes median intensity.
# add new clinical data to the summarized experiment object.
<- SummarizedExperiment::SummarizedExperiment(
metabric assays = list(
median_intensity = metabric_exp[, rownames(clinical_data)]
), colData = clinical_data
)
And we can finally save the RDS file to load it up faster later.
# save the final dataset
saveRDS(
metabric, file = file.path("metabric_filtered.rds")
)