3 Preparing sample information

This chunk reads and explores the samples’ information. Then it selects ER-positive tumours (in the code below: Estrogen Receptor positive, Progesterone Receptor positive and HER2 negative) and Triple-negative tumours (Estrogen Receptor, Progesterone Receptor and HER2 negative). You may try different comparisons, e.g. the dataset includes some normal tissue samples paired with tumours.

In addition to the ER status, it keeps the OCT-embedding status. OCT-embedding is a small variation in the experimental protocol during RNA extraction: some of the TCGA BRCA samples were embedded in OCT prior to RNA extraction, while the others were not. Thus OCT-embedding can be treated as a potentially confounding variable, which is used here to illustrate dealing with batch effects.

# Read samples information (tab-separated text file with a header row).
# TRUE is spelled out because the alias T is an ordinary variable that can be
# reassigned elsewhere in a session.
samples_information_file <- file.path(base_folder, "data", "samples_information.txt")
samples_information.df <- read.table(samples_information_file, header = TRUE, sep = "\t")

# Inspect the samples table: number of rows and columns, then the column names
c(nrow(samples_information.df), ncol(samples_information.df))
## [1] 629  18
names(samples_information.df)
##  [1] "file_id"                 "file_name"               "sample_type"             "sample_oct_embedded"     "bcr_patient_barcode"     "age"                     "menopause_status"        "stage"                   "t"                       "n"                       "m"                       "breast_quadrant"         "histology"               "er"                      "pr"                      "her2"                    "analyte_a260_a280_ratio" "aliquot_concentration"
# Count how many samples there are of each sample type
table(samples_information.df[["sample_type"]])
## 
##       Primary Tumor Solid Tissue Normal 
##                 567                  62
# Build logical masks (one element per row of samples_information.df).
# %in% is used instead of == because == returns NA for a missing annotation;
# an NA in the mask would make sum() return NA and would insert all-NA rows
# when the mask is later used as a row index. With %in% a missing value is
# simply "not selected".

# Select tumours
tumours <- samples_information.df$sample_type %in% "Primary Tumor"

# Select ER-positive samples
# (as defined here: ER-positive, PR-positive and HER2-negative)
er_pos <- samples_information.df$er %in% "Positive" &
          samples_information.df$pr %in% "Positive" &
          samples_information.df$her2 %in% "Negative"

# Select Triple-negative samples (ER-, PR- and HER2-negative)
triple_neg <- samples_information.df$er %in% "Negative" &
              samples_information.df$pr %in% "Negative" &
              samples_information.df$her2 %in% "Negative"

# Combine the conditions (tumours belonging to either group) and check the count
selected_samples <- tumours & (er_pos | triple_neg)
sum(selected_samples)
## [1] 238
# Columns of the samples table that are needed for analysis
selected_fields <- c("bcr_patient_barcode", "file_name", "sample_oct_embedded", "er")

# Keep only the selected samples and columns, and give the columns concise
# names (same order as in selected_fields)
samples.df <- setNames(
  samples_information.df[selected_samples, selected_fields],
  c("patient", "file", "oct", "er")
)

# Check the resulting samples data frame: dimensions and first rows
dim(samples.df)
## [1] 238   4
head(samples.df)
##        patient                                                 file   oct       er
## 1 TCGA-A7-A0DA a33029dd-b5fa-4be0-9cbf-971d289146dd.htseq.counts.gz false Negative
## 3 TCGA-D8-A1XU 8d54214a-1d9b-4fea-9c42-5bbb3cd11da9.htseq.counts.gz false Positive
## 5 TCGA-D8-A143 4b19c0e2-2a61-4f0a-9257-4a528e6b320e.htseq.counts.gz false Negative
## 6 TCGA-A7-A4SB cc233ff9-d5fb-4e9b-9007-f58d008df995.htseq.counts.gz false Positive
## 7 TCGA-D8-A1XR bdd8c340-250b-474a-8802-7653b7884ced.htseq.counts.gz false Positive
## 8 TCGA-BH-A18L 5bc7e90d-0fa4-4bde-bee8-4f9b92de03a2.htseq.counts.gz  true Positive
# Cross-tabulate OCT-embedding status (rows) against ER status (columns)
with(samples.df, table(oct, er))
##        er
## oct     Negative Positive
##   false       23       73
##   true        24      118
# Clean-up: remove the intermediate objects created in this chunk
# (samples.df is not in the list, so it is kept)
rm(list = c(
  "samples_information_file", "samples_information.df",
  "tumours", "er_pos", "triple_neg",
  "selected_samples", "selected_fields"
))