Contents

In these analyses, we compare differential expression findings from two different studies, each of which looked to compare the transcriptomes of human breast cancer samples.

Data herein are labeled as follows:

1 Load R-packages

## load libraries
library('recount')
library('SummarizedExperiment')
library('limma')
library('edgeR')
library('qvalue')
library('matrixStats')
library('RSkittleBrewer')
library('IHW')

We first download data for the project of interest (SRP019936), obtaining expression data. Data can be summarized across samples and genes using colData() and rowData(), respectively.

2 Gene level analysis

## Find the project of interest (SRP019936), e.g. with parts of the abstract
project_info1 <- abstract_search('model for HER2 positive breast tumors')
project_info1
##     number_samples species
## 640             32   human
##     abstract
## 640 The goal of our study is to build an integrated transcriptome landscape model for HER2 positive breast tumors and identify the crucial signaling pathways associated with HER2 tumors. Genomic features include, 685 genes that were differentially expressed only in HER2-positive tumors, 102 genes that were alternatively spliced in a pattern that is unique to HER2-positive tumors, and 303 genes that expressed single nucleotide sequence variants (eSNVs) that were unique to HER2-positive tumors. Network analysis was performed to integrate the genomic features into a transcriptome landscape model that identified 12 highly interconnected cellular processes that appear to be critical to the establishment and maintenance of HER2-positive tumors. We observed that integrin signaling was linked to lapatinib sensitivity in vitro and strongly associated with risk of relapse in the NCCTG N9831 adjuvant trastuzumab clinical trial dataset. Overall design: We analyzed RNA-seq data from a survey panel consisting of 8 benign breast lesions, 8 ER+, 8 triple negative, and 8 HER2-positive primary breast tumors to identify genomic features that were uniquely associated with HER2-positive tumors
##       project
## 640 SRP019936
## Download the gene-level RangedSummarizedExperiment data, unless the file is
## already present. The path is built from the accession returned by the
## search so that this check, the download, and the later load() all refer to
## the same study.
if (!file.exists(file.path(project_info1$project, 'rse_gene.Rdata'))) {
    download_study(project_info1$project)
}

## Load the saved study data; the object `rse_gene` used on the next line is
## kept under a study-specific name so a second study can be loaded later
load(file.path(project_info1[["project"]], "rse_gene.Rdata"))
rse_gene1 <- rse_gene

## Browse the study at SRA
browse_study(project_info1[["project"]])

## Sample phenotype data provided by the recount project
colData(rse_gene1)
## DataFrame with 32 rows and 21 columns
##               project      sample  experiment         run
##           <character> <character> <character> <character>
## SRR791043   SRP019936   SRS403393   SRX254189   SRR791043
## SRR791044   SRP019936   SRS403394   SRX254190   SRR791044
## SRR791045   SRP019936   SRS403395   SRX254191   SRR791045
## SRR791046   SRP019936   SRS403396   SRX254192   SRR791046
## SRR791047   SRP019936   SRS403397   SRX254193   SRR791047
## ...               ...         ...         ...         ...
## SRR791070   SRP019936   SRS403420   SRX254216   SRR791070
## SRR791071   SRP019936   SRS403421   SRX254217   SRR791071
## SRR791072   SRP019936   SRS403422   SRX254218   SRR791072
## SRR791073   SRP019936   SRS403423   SRX254219   SRR791073
## SRR791074   SRP019936   SRS403424   SRX254220   SRR791074
##           read_count_as_reported_by_sra reads_downloaded
##                               <integer>        <integer>
## SRR791043                     105978126        105978126
## SRR791044                      95333634         95333634
## SRR791045                      99628684         99628684
## SRR791046                     101469302        101469302
## SRR791047                     105908146        105908146
## ...                                 ...              ...
## SRR791070                      48067496         48067496
## SRR791071                      43365202         43365202
## SRR791072                      40514396         40514396
## SRR791073                      43101642         43101642
## SRR791074                      57371422         57371422
##           proportion_of_reads_reported_by_sra_downloaded paired_end
##                                                <numeric>  <logical>
## SRR791043                                              1       TRUE
## SRR791044                                              1       TRUE
## SRR791045                                              1       TRUE
## SRR791046                                              1       TRUE
## SRR791047                                              1       TRUE
## ...                                                  ...        ...
## SRR791070                                              1       TRUE
## SRR791071                                              1       TRUE
## SRR791072                                              1       TRUE
## SRR791073                                              1       TRUE
## SRR791074                                              1       TRUE
##           sra_misreported_paired_end mapped_read_count        auc
##                            <logical>         <integer>  <numeric>
## SRR791043                      FALSE         104178189 5176960114
## SRR791044                      FALSE          93149787 4618143004
## SRR791045                      FALSE          94582133 4693785750
## SRR791046                      FALSE          91480736 4532961517
## SRR791047                      FALSE         103567907 5140850573
## ...                              ...               ...        ...
## SRR791070                      FALSE          47433080 2360019131
## SRR791071                      FALSE          42745233 2119904288
## SRR791072                      FALSE          39881890 1980303339
## SRR791073                      FALSE          42431338 2108493246
## SRR791074                      FALSE          55132710 2714772264
##           sharq_beta_tissue sharq_beta_cell_type biosample_submission_date
##                 <character>          <character>               <character>
## SRR791043            breast                  esc   2013-03-22T11:37:31.457
## SRR791044            breast                  esc   2013-03-22T11:37:31.533
## SRR791045            breast                  esc   2013-03-22T11:37:31.583
## SRR791046            breast                  esc   2013-03-22T11:37:31.623
## SRR791047            breast                  esc   2013-03-22T11:37:31.663
## ...                     ...                  ...                       ...
## SRR791070            breast                  esc   2013-03-22T11:37:32.697
## SRR791071            breast                  esc   2013-03-22T11:37:32.747
## SRR791072            breast                  esc   2013-03-22T11:37:32.783
## SRR791073            breast                  esc   2013-03-22T11:37:32.817
## SRR791074            breast                  esc   2013-03-22T11:37:32.853
##           biosample_publication_date   biosample_update_date
##                          <character>             <character>
## SRR791043    2013-12-07T01:12:55.003 2014-03-06T17:06:22.413
## SRR791044    2013-12-07T01:12:57.767 2014-03-06T17:06:22.445
## SRR791045    2013-12-07T01:13:02.953 2014-03-06T17:06:22.483
## SRR791046    2013-12-07T01:13:00.473 2014-03-06T17:06:22.515
## SRR791047    2013-12-07T01:18:43.917 2014-03-06T17:06:22.546
## ...                              ...                     ...
## SRR791070    2013-12-07T01:18:31.917 2014-03-06T17:06:23.633
## SRR791071    2013-12-07T01:18:27.567 2014-03-06T17:06:23.664
## SRR791072    2013-12-07T01:18:26.017 2014-03-06T17:06:23.696
## SRR791073    2013-12-07T01:18:39.517 2014-03-06T17:06:23.728
## SRR791074    2013-12-07T01:18:28.853 2014-03-06T17:06:23.765
##           avg_read_length geo_accession  bigwig_file       title
##                 <integer>   <character>  <character> <character>
## SRR791043             100    GSM1103987 SRR791043.bw  BCT04_mRNA
## SRR791044             100    GSM1103988 SRR791044.bw  BCT12_mRNA
## SRR791045             100    GSM1103989 SRR791045.bw  BCT14_mRNA
## SRR791046             100    GSM1103990 SRR791046.bw  BCT16_mRNA
## SRR791047             100    GSM1103991 SRR791047.bw  BCT22_mRNA
## ...                   ...           ...          ...         ...
## SRR791070             100    GSM1104014 SRR791070.bw BSO32N_mRNA
## SRR791071             100    GSM1104015 SRR791071.bw  BSO36_mRNA
## SRR791072             100    GSM1104016 SRR791072.bw  BSO37_mRNA
## SRR791073             100    GSM1104017 SRR791073.bw DHF168_mRNA
## SRR791074             100    GSM1104018 SRR791074.bw OP-535_mRNA
##                                     characteristics
##                                     <CharacterList>
## SRR791043             tissue type: ER+ Breast Tumor
## SRR791044             tissue type: ER+ Breast Tumor
## SRR791045             tissue type: ER+ Breast Tumor
## SRR791046             tissue type: ER+ Breast Tumor
## SRR791047             tissue type: ER+ Breast Tumor
## ...                                             ...
## SRR791070     tissue type: Benign cell lines (HMEC)
## SRR791071     tissue type: Benign cell lines (HMEC)
## SRR791072     tissue type: Benign cell lines (HMEC)
## SRR791073     tissue type: Benign cell lines (HMEC)
## SRR791074 tissue type: Triple Negative Breast Tumor
## Gene info: per-gene annotation (gene_id, bp_length, symbol) stored as the
## rowData of the downloaded gene-level object
rowData(rse_gene1)
## DataFrame with 58037 rows and 3 columns
##                  gene_id bp_length          symbol
##              <character> <integer> <CharacterList>
## 1     ENSG00000000003.14      4535          TSPAN6
## 2      ENSG00000000005.5      1610            TNMD
## 3     ENSG00000000419.12      1207            DPM1
## 4     ENSG00000000457.13      6883           SCYL3
## 5     ENSG00000000460.16      5967        C1orf112
## ...                  ...       ...             ...
## 58033  ENSG00000283695.1        61              NA
## 58034  ENSG00000283696.1       997              NA
## 58035  ENSG00000283697.1      1184    LOC101928917
## 58036  ENSG00000283698.1       940              NA
## 58037  ENSG00000283699.1        60         MIR4481

Downloaded count data are first scaled to take into account differing coverage between samples. Phenotype data (pheno) are obtained and ordered to match the sample order of the gene expression data (rse_gene). Only those samples that are HER2-positive or TNBC are included for analysis. Prior to differential gene expression analysis, count data are obtained in matrix format and then filtered to only include those genes with greater than five average normalized counts across all samples.

## Scale counts by taking into account the total coverage per sample
rse1 <- scale_counts(rse_gene1)

## Download pheno data from 
## http://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP019936
pheno1 <- read.table('SraRunTable_SRP019936.txt', sep = '\t',
    header = TRUE,
    stringsAsFactors = FALSE)

## Obtain correct order for pheno data: reorder the rows of pheno1 so that
## they follow the run order of the expression object, then confirm it
pheno1 <- pheno1[match(rse1$run, pheno1$Run_s), ]
identical(pheno1$Run_s, rse1$run)
## [1] TRUE
head(cbind(pheno1$Run_s, rse1$run))
##      [,1]        [,2]       
## [1,] "SRR791043" "SRR791043"
## [2,] "SRR791044" "SRR791044"
## [3,] "SRR791045" "SRR791045"
## [4,] "SRR791046" "SRR791046"
## [5,] "SRR791047" "SRR791047"
## [6,] "SRR791048" "SRR791048"
## Obtain grouping information
colData(rse1)$group <- pheno1$tissue_s
table(colData(rse1)$group)
## 
##     Benign cell lines (HMEC)             ER+ Breast Tumor 
##                            8                            8 
##           HER2+ Breast Tumor Triple Negative Breast Tumor 
##                            8                            8
## subset data to HER2 and TNBC types
rse1 <- rse1[, rse1$group %in% c('HER2+ Breast Tumor', 'Triple Negative Breast Tumor')]
rse1
## class: RangedSummarizedExperiment 
## dim: 58037 16 
## metadata(0):
## assays(1): counts
## rownames(58037): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000283698.1 ENSG00000283699.1
## rowData names(3): gene_id bp_length symbol
## colnames(16): SRR791051 SRR791052 ... SRR791065 SRR791074
## colData names(22): project sample ... characteristics group
## Obtain count matrix
counts1 <- assays(rse1)$counts

## Filter count matrix: keep genes whose mean scaled count across the retained
## samples is greater than 5. rowMeans() is the vectorized form of taking
## mean() over every row with apply(); drop = FALSE keeps a matrix even if a
## single gene passes.
filter <- rowMeans(counts1) > 5
counts1 <- counts1[filter, , drop = FALSE]
dim(counts1)
## [1] 29163    16

To get a better sense of the data, we plot the mean-variance relationship for each gene. Similarly, we run principal component analysis (PCA) to identify any sample outliers within the data. We assess the variance explained by each PC and visualize the relationship between samples in the first two PCs.

## Set colors: two palette entries, indexed by the factor level of each
## sample's group
trop <- RSkittleBrewer('tropical')[c(1, 2)]
group_fac <- as.factor(rse1$group)
cols <- as.numeric(group_fac)

## Look at mean variance relationship on the log2(count + 1) scale.
## The transform is computed once and reused for both axes; axis labels are
## set explicitly so they describe the quantity rather than the R expression.
log_counts <- log2(counts1 + 1)
plot(rowMeans(log_counts), rowVars(log_counts),
     pch = 19, col = trop[2],
     xlab = 'Mean of log2(count + 1)',
     ylab = 'Variance of log2(count + 1)')

## Calculate PCs with svd function (each gene is centered on its row mean)
expr.pca <- svd(counts1 - rowMeans(counts1))

## Plot PCs. The axis label reads "Percent", so the proportion of variance is
## scaled to 0-100 before plotting.
pct_var <- 100 * expr.pca$d^2 / sum(expr.pca$d^2)
par(font.lab = 2, cex.lab = 1.2, font.axis = 2, cex.axis = 1.2)
plot(pct_var, pch = 19, col = trop[2], cex = 1.5,
     ylab = 'Percent of variance explained (gene level)', xlab = 'PC #',
     main = 'PCs')

## Plot PC1 vs. PC2, one point per sample, colored by group
par(font.lab = 2, cex.lab = 1.2, font.axis = 2, cex.axis = 1.2)
plot(expr.pca$v[, 1], expr.pca$v[, 2], pch = 19, col = trop[cols], cex = 1.5,
     xlab = 'PC1', ylab = 'PC2',
     main = 'PC (gene level)')
## Legend entries are the factor levels, in the same order as the color index
legend('topright', pch = 19, col = trop[c(1, 2)],
       levels(group_fac))

PCA identifies a clear sample outlier in these data. This sample is removed from analysis prior to moving forward with differential expression analyses. As mentioned previously, prior to differential gene expression analysis, count data are filtered to only include those genes with greater than five average normalized counts across all samples.

## Scale counts by taking into account the total coverage per sample
rse1 <- scale_counts(rse_gene1)

## Download pheno data from 
## http://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP019936
pheno1 <- read.table('SraRunTable_SRP019936.txt', sep = '\t',
    header = TRUE,
    stringsAsFactors = FALSE)

## Obtain correct order for pheno data: reorder the rows of pheno1 so that
## they follow the run order of the expression object, then confirm it
pheno1 <- pheno1[match(rse1$run, pheno1$Run_s), ]
identical(pheno1$Run_s, rse1$run)
## [1] TRUE
head(cbind(pheno1$Run_s, rse1$run))
##      [,1]        [,2]       
## [1,] "SRR791043" "SRR791043"
## [2,] "SRR791044" "SRR791044"
## [3,] "SRR791045" "SRR791045"
## [4,] "SRR791046" "SRR791046"
## [5,] "SRR791047" "SRR791047"
## [6,] "SRR791048" "SRR791048"
## Obtain grouping information
colData(rse1)$group <- pheno1$tissue_s
table(colData(rse1)$group)
## 
##     Benign cell lines (HMEC)             ER+ Breast Tumor 
##                            8                            8 
##           HER2+ Breast Tumor Triple Negative Breast Tumor 
##                            8                            8
## Subset data to HER2 and TNBC types
rse1 <- rse1[, rse1$group %in% c('HER2+ Breast Tumor', 'Triple Negative Breast Tumor')]
rse1
## class: RangedSummarizedExperiment 
## dim: 58037 16 
## metadata(0):
## assays(1): counts
## rownames(58037): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000283698.1 ENSG00000283699.1
## rowData names(3): gene_id bp_length symbol
## colnames(16): SRR791051 SRR791052 ... SRR791065 SRR791074
## colData names(22): project sample ... characteristics group
## Remove outlier sample. The outlier is column 15 of the subset, i.e. run
## SRR791065 (compare the colnames printed before and after removal). It is
## dropped by run ID rather than by position so the step stays correct if the
## sample order or the subset ever changes, and it fails loudly if the run is
## not present.
outlier_run <- 'SRR791065'
stopifnot(outlier_run %in% colnames(rse1))
rse1 <- rse1[, colnames(rse1) != outlier_run]
rse1
## class: RangedSummarizedExperiment 
## dim: 58037 15 
## metadata(0):
## assays(1): counts
## rownames(58037): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000283698.1 ENSG00000283699.1
## rowData names(3): gene_id bp_length symbol
## colnames(15): SRR791051 SRR791052 ... SRR791064 SRR791074
## colData names(22): project sample ... characteristics group
## Obtain count matrix
counts1 <- assays(rse1)$counts

## Filter count matrix: keep genes whose mean scaled count across the retained
## samples is greater than 5. rowMeans() is the vectorized form of taking
## mean() over every row with apply(); drop = FALSE keeps a matrix even if a
## single gene passes.
filter <- rowMeans(counts1) > 5
counts1 <- counts1[filter, , drop = FALSE]
dim(counts1)
## [1] 29323    15

After sample outlier removal, we again plot the mean-variance relationship for each gene and rerun PCA to obtain a global understanding of the relationship between the samples included for study.

## Set colors: two palette entries, indexed by the factor level of each
## sample's group
trop <- RSkittleBrewer('tropical')[c(1, 2)]
group_fac <- as.factor(rse1$group)
cols <- as.numeric(group_fac)

## Look at mean variance relationship on the log2(count + 1) scale.
## The transform is computed once and reused for both axes; axis labels are
## set explicitly so they describe the quantity rather than the R expression.
log_counts <- log2(counts1 + 1)
plot(rowMeans(log_counts), rowVars(log_counts),
     pch = 19, col = trop[2],
     xlab = 'Mean of log2(count + 1)',
     ylab = 'Variance of log2(count + 1)')

## Calculate PCs with svd function (each gene is centered on its row mean)
expr.pca <- svd(counts1 - rowMeans(counts1))

## Plot PCs. The axis label reads "Percent", so the proportion of variance is
## scaled to 0-100 before plotting.
pct_var <- 100 * expr.pca$d^2 / sum(expr.pca$d^2)
par(font.lab = 2, cex.lab = 1.2, font.axis = 2, cex.axis = 1.2)
plot(pct_var, pch = 19, col = trop[2], cex = 1.5,
     ylab = 'Percent of variance explained (gene level)', xlab = 'PC #',
     main = 'PCs')

## Plot PC1 vs. PC2, one point per sample, colored by group
par(font.lab = 2, cex.lab = 1.2, font.axis = 2, cex.axis = 1.2)
plot(expr.pca$v[, 1], expr.pca$v[, 2], pch = 19, col = trop[cols], cex = 1.5,
     xlab = 'PC1', ylab = 'PC2',
     main = 'PC (gene level)')
## Legend entries are the factor levels, in the same order as the color index
legend('topright', pch = 19, col = trop[c(1, 2)],
       levels(group_fac))

Differential gene expression between TNBC and HER2-positive samples is determined using limma and voom. Differentially expressed genes are visualized using a volcano plot to compare the effect size of the differential expression [ as measured by the \(\log_2(\text{fold change})\) in expression ] and its significance [ \(-\log_{10}(\text{p-value})\) ].

## Perform differential expression analysis with limma-voom
## The design has an intercept plus one indicator column for the
## 'Triple Negative Breast Tumor' group (see the printed matrix below).
design <- model.matrix(~ rse1$group)
design
##    (Intercept) rse1$groupTriple Negative Breast Tumor
## 1            1                                      1
## 2            1                                      0
## 3            1                                      1
## 4            1                                      0
## 5            1                                      1
## 6            1                                      0
## 7            1                                      0
## 8            1                                      0
## 9            1                                      0
## 10           1                                      1
## 11           1                                      0
## 12           1                                      0
## 13           1                                      1
## 14           1                                      1
## 15           1                                      1
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$`rse1$group`
## [1] "contr.treatment"
dge <- DGEList(counts = counts1)
dge <- calcNormFactors(dge)
v <- voom(dge, design, plot = TRUE)

fit <- lmFit(v, design)
fit <- eBayes(fit)
## Column 2 is the group indicator from the design above, so these are the
## per-gene statistics for the Triple Negative indicator coefficient
log2FC1 <- fit$coefficients[, 2]
t.mod1 <- fit$t[, 2]
p.mod1 <- fit$p.value[, 2]
## Use the full component name; `$q` only worked through partial matching
q.mod1 <- qvalue(p.mod1)$qvalues

## Histogram of p-values
par(font.lab = 2, cex.lab = 1.2, font.axis = 2, cex.axis = 1.2)
hist(p.mod1, col = trop[2], xlab = 'p-value',
     main = 'Histogram of p-values', breaks = 100)

## Volcano plot
par(font.lab = 2, cex.lab = 1.2, font.axis = 2, cex.axis = 1.2)
neg_log10_p <- -log10(p.mod1)
rx2 <- c(-1, 1) * 1.1 * max(abs(log2FC1))
ry2 <- c(-0.1, max(neg_log10_p)) * 1.1
## Set up empty axes first so the grid can be drawn underneath the points
plot(log2FC1, neg_log10_p,
     type = 'n', xlim = rx2, ylim = ry2,
     xlab = bquote(paste(log[2], ' (fold change)')),
     ylab = bquote(paste(-log[10], ' (p-value)')))
## Unit-spaced grid lines covering the plotted range, derived from the axis
## limits instead of fixed bounds
abline(v = seq(floor(rx2[1]), ceiling(rx2[2]), 1),
       col = 'lightgray', lty = 'dotted')
abline(h = seq(0, ceiling(ry2[2]), 1), col = 'lightgray', lty = 'dotted')
points(log2FC1, neg_log10_p, pch = 19, col = trop[2])
title('Volcano plot: TNBC vs. HER2+ in SRP019936 (gene level)')

To compare these findings with the breast cancer transcriptome data used to identify differential gene, exon, expressed region, and junction expression (SRP032789), we must again acquire these data, filter the read counts, and summarize gene expression as explained previously.

3 Independence hypotheses weighting

## Find second project of interest (SRP032789), e.g. with parts of the abstract
project_info2 <- abstract_search('To define the digital transcriptome of three breast cancer')

## Download the gene-level RangedSummarizedExperiment data, unless the file is
## already present. The path is built from the accession returned by the
## search so that this check, the download, and the load() below all refer to
## the same study.
if (!file.exists(file.path(project_info2$project, 'rse_gene.Rdata'))) {
    download_study(project_info2$project)
}

## Load the data
load(file.path(project_info2$project, 'rse_gene.Rdata'))
rse_gene2 <- rse_gene

## Scale counts by taking into account the total coverage per sample
rse2 <- scale_counts(rse_gene2)

## Download additional phenotype data from 
## http://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP032789
pheno2 <- read.table('SraRunTable_SRP032789.txt', sep = '\t',
                    header = TRUE,
                    stringsAsFactors = FALSE)

## Obtain correct order for pheno data: reorder the rows of pheno2 so that
## they follow the run order of the expression object, then confirm it
pheno2 <- pheno2[match(rse2$run, pheno2$Run_s), ]
identical(pheno2$Run_s, rse2$run)
## [1] TRUE
head(cbind(pheno2$Run_s, rse2$run))
##      [,1]         [,2]        
## [1,] "SRR1027171" "SRR1027171"
## [2,] "SRR1027173" "SRR1027173"
## [3,] "SRR1027174" "SRR1027174"
## [4,] "SRR1027175" "SRR1027175"
## [5,] "SRR1027176" "SRR1027176"
## [6,] "SRR1027177" "SRR1027177"
## Obtain grouping information
colData(rse2)$group <- pheno2$tumor_type_s
table(colData(rse2)$group)
## 
## HER2 Positive Breast Tumor      Non-TNBC Breast Tumor 
##                          5                          6 
##    Normal Breast Organoids          TNBC Breast Tumor 
##                          3                          6
## Subset data to HER2 and TNBC types
rse2 <- rse2[, rse2$group %in% c('HER2 Positive Breast Tumor', 'TNBC Breast Tumor')]
rse2
## class: RangedSummarizedExperiment 
## dim: 58037 11 
## metadata(0):
## assays(1): counts
## rownames(58037): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000283698.1 ENSG00000283699.1
## rowData names(3): gene_id bp_length symbol
## colnames(11): SRR1027171 SRR1027173 ... SRR1027187 SRR1027172
## colData names(22): project sample ... characteristics group
## Obtain count matrix without filtering
counts2 <- assays(rse2)$counts
dim(counts2)
## [1] 58037    11

With count data from both studies, we will run PCA to assess global expression patterns across studies and samples.

4 Cross-study PCA

## Combine expression data across studies. merge() on row names keeps only
## genes present in both matrices; counts1 was filtered earlier and counts2
## was not, so the result is limited to genes that passed the study-1 filter.
## Study-1 samples come first, followed by study-2 samples.
combined_counts <- merge(counts1, counts2, by = "row.names")
rownames(combined_counts) <- combined_counts$Row.names
combined_counts <- combined_counts[, -1]

## Make sure phenotypes are annotated the same way in both studies.
## The labels are literal text, so match them with fixed = TRUE rather than
## as regular expressions.
combined_pheno <- c(rse1$group, rse2$group)
combined_pheno <- gsub("Triple Negative Breast Tumor", "TNBC Breast Tumor",
                       combined_pheno, fixed = TRUE)
combined_pheno <- gsub("HER2 Positive Breast Tumor", "HER2+ Breast Tumor",
                       combined_pheno, fixed = TRUE)
## One label per sample column, in the same study-1-then-study-2 order
stopifnot(length(combined_pheno) == ncol(combined_counts))

## Calculate PCs with svd function. merge() returns a data.frame, so convert
## to a numeric matrix once before centering each gene on its row mean.
combined_mat <- as.matrix(combined_counts)
expr.pca <- svd(combined_mat - rowMeans(combined_mat))

## Plot PCs. The axis label reads "Percent", so the proportion of variance is
## scaled to 0-100 before plotting.
pct_var <- 100 * expr.pca$d^2 / sum(expr.pca$d^2)
par(font.lab = 2, cex.lab = 1.2, font.axis = 2, cex.axis = 1.2)
plot(pct_var, pch = 19, col = trop[2], cex = 1.5,
     ylab = 'Percent of variance explained (gene level)', xlab = 'PC #',
     main = 'PCs')