library(dplyr)
library(TCGAbiolinks)
library(SummarizedExperiment)
library(DESeq2)
library(edgeR)
library(rtracklayer)
1 TCGA
Get dataset from tcga following procedure in https://www.bioconductor.org/packages/release/workflows/vignettes/SingscoreAMLMutations/inst/doc/workflow_transcriptional_mut_sig.html#3_Downloading_and_preparing_the_data
setwd(file.path("~", "Documents", "BioRepo"))
<- TCGAbiolinks::GDCquery(
query_rna project = 'TCGA-BRCA',
data.category = 'Transcriptome Profiling',
data.type = 'Gene Expression Quantification',
workflow.type = 'HTSeq - Counts'
)
<- TCGAbiolinks::getResults(query_rna)
rnaseq_res # the number of rows corresponds to the number of rna-seq samples. notice
# that you can have two samples coming from the same patient.
dim(rnaseq_res)
Path to download data:
<- tempdir()
tmp_dir <- file.path(tmp_dir, 'GDCdata')
datapath ::GDCdownload(query_rna, directory = datapath)
TCGAbiolinks<- TCGAbiolinks::GDCprepare(query_rna, directory = datapath) brca_se
We will save the steps in the meantime in case there is a problem in the pipeline, therefore we don’t redo all the steps
# save brca_se containing all samples
saveRDS(
brca_se, file.path("Data", "20210313_tcga_brca", "tcga_brca_all.rds")
)
<- readRDS("Data/20210313_tcga_brca/tcga_brca_all.rds")
brca_se # remove samples that we are not interested. so we only get tumor samples
# from patients with clinical data. we can get the same patients as
# from the clinical_data dataframe
<- colData(brca_se)
clinical_data_gdc <- clinical_data_gdc[!clinical_data_gdc$is_ffpe, ]
clinical_data_gdc <- clinical_data_gdc[clinical_data_gdc$sample_type_id == "01", ] clinical_data_gdc
We now filter the clinical data to remove those patient in stage x as these are tumors that couldn’t be evaluated
<-
clinical_data_gdc !is.na(clinical_data_gdc$tumor_stage), ]
clinical_data_gdc[<-
clinical_data_gdc $tumor_stage != "stage x", ]
clinical_data_gdc[clinical_data_gdc
# get molecular subtypes and append to clinical data
<- TCGAbiolinks::PanCancerAtlas_subtypes() %>%
subtypes ::filter(cancer.type == "BRCA") %>%
dplyr::filter(substring(pan.samplesID, 14, 15) == "01") %>%
dplyr::filter(!duplicated(pan.samplesID)) %>% data.frame()
dplyr
rownames(subtypes) <- subtypes$pan.samplesID
$molecular_subtype <-
clinical_data_gdcrownames(clinical_data_gdc), "Subtype_mRNA"]
subtypes[
# remove patients without molecular subtype data
<-
clinical_data_gdc !is.na(clinical_data_gdc$molecular_subtype), ] clinical_data_gdc[
We now calculate the time to death or last followup and append to the clinical data. this is the time to event we are analysing.
<-
time_status_patients apply(
clinical_data_gdc,1,
function(x){
<- x[["days_to_death"]]
days_to_death <- x[["days_to_last_follow_up"]]
days_to_last_follow_up <- c()
status_patient
if(is.na(days_to_death) && is.na(days_to_last_follow_up)){
<- NA
time_patient <- c(status_patient, NA)
status_patient else if (is.na(days_to_death)) {
} <- as.numeric(days_to_last_follow_up)
time_patient <- c(status_patient, 1)
status_patient else {
} <- as.numeric(days_to_death)
time_patient <- c(status_patient, 2)
status_patient
}
c(time_patient, status_patient)
}
)
$status <- time_status_patients[2, ]
clinical_data_gdc$time <- time_status_patients[1, ]
clinical_data_gdc
# remove the patients with no time or status
<- clinical_data_gdc[
clinical_data_gdc !is.na(clinical_data_gdc$time)) & (!is.na(clinical_data_gdc$status)),
(
]
<- clinical_data_gdc[
clinical_data_gdc $time != 0,
clinical_data_gdc
]
Let us add the ER and PR status for the patients. For this we need to use another dataset that was downloaded from GDC.
<-
clinical_data_er read.csv(
file.path(
"Data",
"20210313_tcga_brca",
"clinical_data",
"nationwidechildrens.org_clinical_patient_brca.txt"
), header = 1,
sep = "\t"
)
rownames(clinical_data_er) <- clinical_data_er$bcr_patient_barcode
# add er and pr status to original clinical data
$er_status <-
clinical_data_gdc
clinical_data_er["patient"],
clinical_data_gdc[, "breast_carcinoma_estrogen_receptor_status"
]
$pr_status <-
clinical_data_gdc
clinical_data_er["patient"],
clinical_data_gdc[, "breast_carcinoma_progesterone_receptor_status"
]
# keep only the patients filtered above
<- brca_se[, rownames(clinical_data_gdc)]
brca_se colData(brca_se) <- clinical_data_gdc
saveRDS(brca_se, file.path(tmp_dir, "brca_se_filter.Rdata"))
# filter genes lowly expressed in 30% of the samples
<- edgeR::DGEList(counts = assay(brca_se), genes = rowData(brca_se))
brca_dge <- rowMeans(cpm(brca_dge) > 1)
prop_expressed <- prop_expressed > 0.3
keep
# check the distribution of the counts
= par(no.readonly = TRUE)
op par(mfrow = c(1, 2))
hist(cpm(brca_dge, log = TRUE), main = 'Unfiltered', xlab = 'logCPM')
abline(v = log(1), lty = 2, col = 2)
hist(cpm(brca_dge[keep, ], log = TRUE), main = 'Filtered', xlab = 'logCPM')
abline(v = log(1), lty = 2, col = 2)
<- brca_dge[keep, , keep.lib.sizes = FALSE]
brca_dge <- brca_se[keep, ]
brca_se
saveRDS(
brca_se, file.path(tmp_dir, "brca_se_filter_gene.Rdata")
)
Now we get the gene length information for the normalization step.
<- 'gencode.v22.annotation.gtf.gz'
gencode_file <-
gencode_link paste(
'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22',
gencode_file,sep = '/'
)
<- file.path("Data", "20210313_tcga_brca", gencode_file)
path_to_gencode
download.file(gencode_link, path_to_gencode, method = 'libcurl')
<- rtracklayer::import.gff(
gtf
path_to_gencode, format = 'gtf',
genome = 'GRCm38.71',
feature.type = 'exon'
)
# split records by gene to group exons of the same gene
<- reduce(split(gtf, elementMetadata(gtf)$gene_id))
grl <-
gene_lengths sapply(
grl, function(x) {
#sum up the length of individual exons
c('gene_length' = sum(width(x)))
}
)
<-
gene_lengths_tmp data.frame(gene_length = gene_lengths, row.names = names(grl))
#extract information on gene biotype
<- unique(elementMetadata(gtf)[, c('gene_id', 'gene_type')])
genetype colnames(genetype)[1] <- 'ensembl_gene_id'
rownames(genetype) <- genetype$ensembl_gene_id
$gene_type <-
gene_lengths_tmprownames(gene_lengths_tmp), "gene_type"]
genetype[$ensembl_gene_id <-
gene_lengths_tmprownames(gene_lengths_tmp), "ensembl_gene_id"]
genetype[
#remove ENSEMBL ID version numbers
$ensembl_gene_id <-
gene_lengths_tmpgsub('\\.[0-9]*', '', gene_lengths_tmp$ensembl_gene_id)
saveRDS(
gene_lengths_tmp, file = file.path(tmp_dir, "gene_lengths_HTSeq_gencodev22.rds")
)
<- gene_lengths_tmp
gene_lengths rownames(gene_lengths) <- gene_lengths$ensembl_gene_id
rowData(brca_se)$gene_length <- gene_lengths[rownames(brca_se), 'gene_length']
rowData(brca_se)$gene_biotype <- gene_lengths[rownames(brca_se), 'gene_type']
#annotate gene lengths for the DGE object
$genes$length <- gene_lengths[rownames(brca_dge), 'gene_length']
brca_dge
# calculate the tmm normalization from edgeR
<- calcNormFactors(brca_dge, method = 'TMM')
brca_dge_tmm
# compute log FPKM values and append to assays
assay(brca_se, 'logFPKM_TMM') <- rpkm(brca_dge_tmm, log = TRUE)
assay(brca_se, "vst") <- DESeq2::vst(assay(brca_se))
Save the final dataset.
saveRDS(
brca_se, file = file.path("Data", "20210313_tcga_brca", "tcga_brca_tumor_filtered.rds")
)