TCGA简单数据分析

魔法师LQ

目标

下载TCGA-LUAD数据库中RNA-seq数据,对其进行聚类。

准备工作

根据之前的介绍,下载自己感兴趣的数据,流程如下,

  1. 进入TCGA门户
  2. 全部数据:选择Project->项目(例如TCGA-LUAD)->数据类型(例如RNA-seq)->Manifest;部分数据:Project->项目(例如TCGA-LUAD)->数据类型(例如RNA-seq)->筛选一部分案例(cart旁边的复选框)->Save/Edit Case Set->Manage Sets->View Files In Respository->Files->点击饼状图进行筛选(例如Access Level)->Add All Files to Cart
  3. 使用gdc-client进行下载

下载自己感兴趣的数据类型,否则数据规模会非常的大,解压缩后光读入整合就需要非常大的空间,例如会报出如下错误,

1
#  Error: cannot allocate vector of size 96.9 Gb

尤其是当把多个文件读取并整合的时候。

在验证算法可行性的时候,也应从小样本开始,下载的时候挑选几个案例即可。总而言之就是,

  1. 找自己感兴趣个数据
  2. 从较小的规模开始
  3. (从较为简单的模型开始)

下载后的数据类似如下,

数据整合

以RNA-seq数据为例,将下载后的72个Cases,整合成一个数据框,数据框的大小为(60483,73),每一列表示一个case,每一行表示一种基因(共有60482种)。

为了方便处理,将数据框的names更改为Case-ID对应的TCGA-ID

获取TCGA-ID

调用GCD的API,构建请求用的JSON文件,命名为Payload_test.txt,内容如下,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
{
"filters":{
"op":"in",
"content":{
"field":"files.file_id",
"value":[
"0a5d1b6d-381b-4dfa-ac4e-af4f7152791b",
"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
]
}
},
"format":"TSV",
"fields":"file_id,file_name,cases.submitter_id,cases.case_id,data_category,data_type,cases.samples.tumor_descriptor,cases.samples.tissue_type,cases.samples.sample_type,cases.samples.submitter_id,cases.samples.sample_id,cases.samples.portions.analytes.aliquots.aliquot_id,cases.samples.portions.analytes.aliquots.submitter_id",
"size":"100"
}

下载后执行curl指令,

1
curl --request POST --header "Content-Type:application/json" --data @Payload_test.txt 'https://api.gdc.cancer.gov/files' > ID_transfer_test.txt

从而将请求的信息保存到ID_transfer_test.txt文件中,

聚类分析

参考

[1] TCGA入门——数据下载、整合及简单应用

附录

附录A. RNA-seq整合(R)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

path <- "~/NewDisk/LungCancer/TCGA/Project02-RNA-seq/small2" # 这里采用绝对路径,之后再试试相对路径
dirs <- list.files(path) # 其中存放者多个文件夹,每个文件夹下又有多个文件

symbol <- '.gz' # gz文件的标志

dir <- dirs[1]
fulldir <- paste(path, '/', dir, sep = '')
files <- list.files(fulldir)
gz <- files[grepl(symbol, files)] # 获取.gz文件的文件名
gz_full <- paste(fulldir, '/', gz, sep = '') # .gz文件的完整路径
# 读取压缩文件到数据框
content <- read.table(gzfile(gz_full), fill = TRUE, header = FALSE)
names(content) <- c("ENSG_ID",dir) # 貌似有些问题
LUAD_df <- content

# 数据整合

for (dir in dirs[2:length(dirs)]){
# 获取每个路径下的.gz文件
fulldir <- paste(path, '/', dir, sep = '')
files <- list.files(fulldir)
gz <- files[grepl(symbol, files)] # 获取.gz文件的文件名
gz_full <- paste(fulldir, '/', gz, sep = '') # .gz文件的完整路径
print(paste("Merging... ", gz))
# print(gz_full)
# 读取压缩文件到数据框
content <- read.table(gzfile(gz_full), fill = TRUE, header = FALSE)
names(content) <- c("ENSG_ID", dir) # ENSG_ID 作为排序合并
LUAD_df <- merge(LUAD_df, content, by = "ENSG_ID")
}
# Error: cannot allocate vector of size 96.9 Gb # 数据量太大无法处理, 故改用Python试试(TUDO)
# 使用一部分数据进行分析,但这只是权宜之策
num_gene <- dim(LUAD_df)[1] - 1
num_cases <- dim(LUAD_df)[2] - 1

write.csv(LUAD_df, paste("rna_", num_cases, "_cases_df.csv", sep = ""), row.names = FALSE)
print('Success!')


# 制作JSON请求文件
fout <- "Payload_test.txt"
payload <- file(fout, "w") # https://codeday.me/bug/20190117/538824.html,
for (dir in dirs){
# print(dir)
# write(paste('"', dir, '"', ",", sep = ""), payload, append = T) # 方式1,“w”一定得要,否则很玄学,被覆盖了
write(paste('"', dir, '"', ",", sep = ""), fout, append = T) # 方式2,直接文件名
}
close(payload)
# 执行指令
# curl --request POST --header "Content-Type:application/json" --data @Payload_test.txt 'https://api.gdc.cancer.gov/files' > ID_transfer_test.txt



###############################################
# 将LUAD_df的names更改为TCGA-SUBMIT-ID
ID_df <- read.csv("ID_transfer_test.txt", header = TRUE, sep = "\t")
write.csv(ID_df, "ID_df.csv", row.names = F)

Group_df <- data.frame( ID_df$cases.0.samples.0.submitter_id, ID_df$cases.0.samples.0.sample_type )
names(Group_df) <- c("TCGA_submit_ID", "Type")
write.csv(Group_df, "Group_df.csv")

# names(LUAD_df)中的ID,变换成ID_df中的submitid
new_names <- c()
old_names <- names(LUAD_df)
ids <- ID_df$id
submitterids <- ID_df$cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id
for (nm in old_names){
new_name <- submitterids[nm == ids ]
# print(as.character(new_name))
new_names <- c(new_names, as.character(new_name)) # https://www.cyclismo.org/tutorial/R/types.html

}
print(length(new_names))

LUAD_DAT <- LUAD_df
rownames(LUAD_DAT) <- LUAD_DAT[, 1] # 将第一列作为数据框的列名
LUAD_DAT <- LUAD_DAT[c(-1)]
names(LUAD_DAT) <- new_names
write.csv(LUAD_DAT, "LUAD_DAT.csv") # 处理完毕


###############################################
#引入软件包ballgown
# source("https://bioconductor.org/biocLite.R")
# biocLite("ballgown")
library("ballgown")


# 差异基因比较
result_diff <- stattest(gowntable = LUAD_DAT,
pData = Group_df,
covariate = "Type",
getFC = TRUE,
log = TRUE,
feature = "transcript"
)

write.csv(result_diff, "result_diff.csv")

print(dim(result_diff))


# 结果呈现
library(ggplot2)
# 火山图
threshold <- as.factor((result_diff$fc > 2 | result_diff$fc <0.5) & result_diff$pval < 0.05 & result_diff$qval < 0.01 )
ggplot(result_diff, aes(x = log2(result_diff$fc), y = -1*log10(result_diff$pval), colour = threshold )) + xlab("log2 fold-change") + ylab("-log10 p-value") + geom_point()
ggsave("LUAD_volcano.png")
ggsave("LUAD_volcano.pdf")
# 暂时不太懂什么意思,且原数据中有很多缺失
# Removed 8411 rows containing missing values (geom_point).


# 基因聚类
# install.packages("pheatmap")
library(pheatmap)

# 求差异基因
result_diff_filter <- result_diff[which(result_diff$pval < 0.05 & result_diff$qval < 0.01 & (result_diff$fc > 2 | result_diff$fc < 0.5 )),]
result_diff_filterGene <- result_diff_filter['id']
filter_data <- LUAD_DAT[result_diff_filterGene[, 1],]
filter_mat <- as.matrix.data.frame(filter_data)

dat <- matrix(as.numeric(filter_mat), ncol = num_cases)

# 标注
colnames(dat) <- as.character(Group_df[,1])
annotation_col <- data.frame(
CellType = factor(Group_df[, 2])
)
# rownames(annotation_col) <- colnames(dat)


# 类型颜色
ann_colors <- list(
CellType = c( "#1B9E77", "#D95F02")
)

pheatmap(log2(dat+1),
scale = "row",
clustering_distance_rows = "correlation",
# color = colorRampPalette(c("red", "black", "green"))(100),
# border_color = NA,
show_rownames = T,
show_colnames = T,
main ="TCGA Gene Expression Difference Between Tumor & Normal",
cluster_rows = TRUE,
cluster_cols = TRUE,
)