R CODE BAR

**##这一段是关于通过BiocManager安装包的代码**:

if (!requireNamespace("BiocManager", quietly = TRUE))

install.packages("BiocManager")&install.packages("corrplot",
**repos="<https://mirrors.tuna.tsinghua.edu.cn/CRAN>"**)

BiocManager::install(version = "3.15")

library("BiocManager")

BiocManager::install(c("ggplot2","ggtree","DESeq2"))

##这一段是关于librarian安装包的代码
library(librarian)  #加载包(包名称)
shelf(GEOquery,dplyr,tibble,limma)

#.libPaths( c( .libPaths(), "/home/r/R/x86_64-redhat-linux-gnu-library/4.1") )
#R语言修改libPath包的储存路径%libpaths%
#升级R IN VSCODE
install.packages("installr") 
library(installr)
updateR()

**#芯片数据注释**
library(AnnoProbe)
#选择要注释的探针类型
gpl='GPL16956'
#得到探针对应的基因名字
probe2gene=idmap(gpl,type = 'pipe')
#展示前10条结果
head(probe2gene)
**#画图**
png("normalmap.png")
normalmap <- boxplot(exp, outline = FALSE, notch = T, col = group_list, las = 2)
dev.off()
**##ggsave("corr_all.png", plot = last_plot(), device = "png", dpi = 300, width = 7, height = 5, units = "in")**

**#初阶**
rm(list = ls())**#一键清空环境变量#**
options(stringsAsFactors = F) **#转换格式#**
wd <- getwd()  **#文件定位#**
setwd(paste0(wd,"/geo_vas")) **#设置工作地点#**
f=paste0(gseAcc,'_eSet.Rdata') **#逗号前后连接#**
load(f) **#载入数据#**
class(gset)    **#查看数据类型#**
dt <-read.csv2("file",encoding = "uft-8",sep = ",")    **#打开csv只有一列#
data <- read.table("example.tsv", sep="\\t", header=TRUE)**   **#常规读取#**
write.csv(exp_gtex.tpm,file = "exp_gtex.tpm.csv",quote = FALSE)    **#结果保存#**
write.table(nrDEG,"limma_result2.txt",sep = "\\t",row.names = T,quote = F,col.names = T)   **#结果保存2#
#vlookup#    c = merge(a,b,by = '匹配条件',all=TRUE, sort=TRUE)**

**#proxy----#**
##这一段代码是用来r翻墙的
Sys.setenv(http_proxy="<http://127.0.0.1:7890>")
Sys.setenv(https_proxy="<http://127.0.0.1:7890>")
Sys.setenv(all_proxy="socks5://127.0.0.1:7890")

**##Ensembl_ID转换成SYMBOL##**
dt<- read.table("/Users/linfengni/Documents/数据库(TCGA+GEO)/GSE81089/limma_result2.txt",sep = "\\t",quote = "",fill = T,header = T)
dt[1:5,1:5]
data=data.frame(dt)
data[1:5,1:5]
colnames(data)[1] <- "ENSEMBL"library(stringi)##加载包
data$Ensembl_ID=stri_sub(data$Ensembl_ID,1,15)##保留前15位
#加载相关包
library(clusterProfiler)
library(org.Hs.eg.db)# 
查看org.Hs.eg.db 
包提供的转换类型
keytypes(org.Hs.eg.db)
# 需要转换的Ensembl_ID
Ensembl_ID <- data$Ensembl_IDtable[Ensembl_ID]
# 采用bitr()函数进行转换
gene_symbol <- bitr(Ensembl_ID, fromType="ENSEMBL", toType=c("SYMBOL"), OrgDb="org.Hs.eg.db")
# 查看转换的结果
head(gene_symbol)
# 两个数据进行vlookup匹配
dt = merge(gene_symbol ,data ,by = 'ENSEMBL',all=TRUE, sort=TRUE)
write.table(dt,"list2.txt",sep = "\\t",row.names = T,quote = F,col.names = T)
**##limma差异分析**

library(limma)
setwd("/Users/linfengni/Documents/数据库(TCGA+GEO)/GSE138172")
dt <-read.csv2("/Users/linfengni/Documents/数据库(TCGA+GEO)/GSE138172/GSE差异分析芯片数据.csv",encoding = "uft-8",sep = ",")
dim(dt)
dt[1:4,1:4]
group_list<-read.csv2("/Users/linfengni/Documents/数据库(TCGA+GEO)/GSE138172/分组信息.csv",encoding = "uft-8",sep = ",")
new_group<- group_list[order(group_list[,2]),] 
group<-new_group[,3]
table(group)
rownames(dt) <- dt$X
dt[1:4,1:4]
dt <- dt[,-1]
dt[1:4,1:4]
data <- dt
data<-as.data.frame(data)

design <- model.matrix(~0+factor(group))
**colnames(design)=c("test","ctrl")**
rownames(design)=colnames(data)

data=as.data.frame(lapply(data,as.numeric))
data3[3,3]
contrast.matrix<-makeContrasts("ctrl-test",levels=design)
data=as.data.frame(lapply(data,as.numeric))
rownames(data) <- rownames(dt)
data[3,3]
##step1
fit <- lmFit(data,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)  
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput) 
nrDEG[1:5,1:5]

write.table(nrDEG,"limma_result2.txt",sep = "\\t",row.names = T,quote = F,col.names = T)

**#非编码注释文件和平台探针匹配genesymbol#**

rm(list = ls())
load(file = "/Users/linfengni/Documents/数据库(TCGA+GEO)/非编码注释文件/GPL16956_probe_ID.Rdata")
exprSet  <- read.table("/Users/linfengni/Documents/数据库(TCGA+GEO)/GSE138172/limma_result3.txt",sep = "\\t",quote = "",fill = T,header = T)
exprSet <- cbind(probe_id = rownames(exprSet),exprSet)##把行名变成第一列
probe2ID

exprSet1 <- merge(probe2ID,exprSet,by = "probe_id")
write.table(exprSet1,"limma_result555.txt",sep = "\\t",row.names = T,quote = F,col.names = T)

**##GTEx数据库正常肺组织数据提取处理**
library(data.table) 
#表达矩阵
exp_gtex.tpm=read.table("/Users/linfengni/Documents/数据库(TCGA+GEO)/GTEx/gtex_RSEM_gene_tpm",header = T, sep = '\\t',check.names=FALSE)
rownames(exp_gtex.tpm)=exp_gtex.tpm[,1]
exp_gtex.tpm=exp_gtex.tpm[,-1]
exp_gtex.tpm[1:2,1:2]

#样本信息
data_cl=read.table("/Users/linfengni/Documents/数据库(TCGA+GEO)/GTEx/GTEX_phenotype",header = T, sep = '\\t')
data_cl=data_cl[,c(1,3)]
names(data_cl)=c('Barcode','Tissue')
data_cl=data_cl[data_cl$Tissue == 'Lung',] #筛选出lung的数据

annotat1=read.table("/Users/linfengni/Documents/数据库(TCGA+GEO)/GTEx/gencode.v23.annotation.gene.probemap",header = T, sep = '\\t')
annotat=annotat1[,c(1,2)]
rownames(annotat)=annotat[,1] #这里没有选择删去id这一列

#筛选,筛选之后还剩100个barcode
exp_gtex.tpmm <- exp_gtex.tpm
exp_gtex.tpmm[1:2,1:2]

rownames(exp_gtex.tpmm)=exp_gtex.tpmm[,1]

exp_gtex.tpmm[1:2,1:2]

exp_gtex.tpmm2=exp_gtex.tpmm[,colnames(exp_gtex.tpmm) %in% data_cl$Barcode]
exp_gtex.tpmm2[1:5,1:5]
#还原为TPM
exp_gtex.tpmm3=2^exp_gtex.tpmm2-0.001
exp_gtex.tpmm3[1:5,1:5]

exp_gtex.tpm <- exp_gtex.tpmm3

# rownames(exp_gtex.tpm)=exp_gtex.tpm[,1] 
# #基因注释
exp_gtex.tpm=as.matrix(exp_gtex.tpm)

t_index=intersect(rownames(exp_gtex.tpm),rownames(annotat)) #行名取交集,t_index中是能够进行注释的probe_id

exp_gtex.tpm=exp_gtex.tpm[t_index,]
annotat=annotat[t_index,]

exp_gtex.tpm[1:5,1:5]

**rownames(exp_gtex.tpm)=annotat$gene**

exp_gtex.tpm[1:5,1:5]

#去除重复基因名
t_index1=order(rowMeans(exp_gtex.tpm),decreasing = T)

t_data_order=exp_gtex.tpm[t_index1,]
keep=!duplicated(rownames(t_data_order))#对于有重复的基因,保留第一次出现的那个,即行平均值大的那个
exp_gtex.tpm=t_data_order[keep,]#得到最后处理之后的表达谱矩阵
table(exp_gtex.tpm)

#读出
write.csv(exp_gtex.tpm,file = "exp_gtex.tpm.csv",quote = FALSE)

**##TCGA数据库芯片数据归总到一个tsv的方法**
1、下载芯片数据
2、从每个文件夹中提取tsv%%搜索?.tsv得到左右的子文件夹中tsv后复制粘贴到一个新的文件夹中(包含所有芯片数据tsv)
#code
#设置芯片数据所在文件夹路径
outputFinder <- "/Users/linfengni/learning project/LUAD"
setwd("/Users/linfengni/learning project/LUAD/LUAD临床信息/gdc_download_20221029_025142.817701") 

json <- fromJSON(txt = list.files(pattern = ".json"))
MMRF_ID<-list()
matrix_MMRF<-list()
filedir_in_json<-list()
for (i in 1:nrow(json)) {
  MMRF_ID[i]<-json[[3]][[i]]
}
MMRF_ID<-do.call(rbind, MMRF_ID)
for (i in 1:nrow(json)) {
  filedir_in_json[i]<-json[i,]["file_id"]
}
filedir_in_json<-do.call(rbind, filedir_in_json)
filedir_in_json<-as.character(filedir_in_json)

#提取表达量至一个数据框(以tibble格式),counts值选4,fpkm选8,tpm选7
for (i in 1:length(filedir_in_json)) {
  setwd(paste0("./",filedir_in_json[i]))

  matrix_MMRF[[i]] = read.table(file = list.files(pattern = ".tsv"), header = F,fill = TRUE)[,7]
  setwd("../")
}
setwd(paste0("./",filedir_in_json[i]))
probematrix<-read.table( list.files(pattern = ".tsv"), header = F,fill = TRUE)[,2:3]

matrix_MMRF<-do.call(cbind, matrix_MMRF)
matrix_MMRF<-cbind(probematrix,matrix_MMRF)
tibble_MMRF <- tibble::as_tibble(matrix_MMRF)
colnames(tibble_MMRF)<-c("gene_name","gene_type",MMRF_ID[,1])

#tibble_MMRF@这是一个把所有样本归到一个表格后&以文件形式导出
setwd(outputFinder)
write.table(tibble_MMRF,"tibble_MMRF.txt",sep = "\\t",row.names = F,col.names = T,quote = F)

#如何从芯片数据中挑选目标数据group&LncRNA、mRNA...
pcg = c("lncRNA")
mRNA_exprSet <- tibble_MMRF %>% 
  dplyr::filter(gene_type%in%pcg) %>%
  dplyr::select(-gene_type) %>%
  dplyr::distinct(gene_name, .keep_all =TRUE)
#限制列名的字符长度 
colnames(mRNA_exprSet) <- str_sub(colnames(mRNA_exprSet),1,16)
##这里筛除重复样本
mRNA_exprSet2 <- mRNA_exprSet[,-which(duplicated(colnames(mRNA_exprSet)))]
which(duplicated(colnames(mRNA_exprSet)))

#利用正则表达式把test和control分开 
mRNA_exprSet3 <- mRNA_exprSet2 %>%
  select(-matches(".-[0-9]{2}[^A]$")) %>%
  select(-matches(".-0[1-9][A]$"), everything())

mRNA_exprSet_control <- mRNA_exprSet3 %>% select(-matches(".-0[1-9][A]$"))
mRNA_exprSet_test <- mRNA_exprSet3 %>% select(matches(".-0[1-9][A]$"))
mRNA_exprSet <- cbind(mRNA_exprSet_control, mRNA_exprSet_test)

#colnames(mRNA_exprSet_control)
samplenames <- as.data.frame(colnames(mRNA_exprSet))
samplenames$Group <- "NHSC" #这里是group类别
samplenames$Group[which((str_detect(samplenames[,1], ".-[^0][0-9][A]$") == T))] = "normal"
samplenames$Group[which((str_detect(samplenames[,1], ".-0[1-9][A]$") == T))] = "cancer"
table(samplenames$Group)
samplefile <- samplenames
colnames(samplefile) <- c("Sample_Name","Sample_Group")
write.table(samplefile,"samplefile.txt",sep = "\\t",row.names = F,quote = F,col.names = T)  ##samplefile文件是两列矩阵,包含对象ID和分组信息
#提取临床信息
colname_exprSet <- colnames(mRNA_exprSet)
colnames(mRNA_exprSet) <- str_sub(colnames(mRNA_exprSet),1,12)
clinic <- read.table("/Users/linfengni/learning project/LUAD/LUAD临床信息/临床数据/clinical.cart.2022-10-30/clinical.tsv",sep = "\\t",quote = "",fill = T,header = T)
mRNA_exprSet <- mRNA_exprSet[,c("gene_name",intersect(colnames(mRNA_exprSet),clinic[,2]))]                        ##intersect用来匹配两者相同值
#临床信息提炼
clinic <- read.table("/Users/linfengni/learning project/LUAD/LUAD临床信息/临床数据/clinical.cart.2022-10-30/clinical.tsv",sep = "\\t",quote = "",fill = T,header = T)

clinical_trait <- clinic %>%
  dplyr::select(case_submitter_id,age_at_index,ajcc_pathologic_m,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,gender,vital_status,days_to_last_follow_up, site_of_resection_or_biopsy,treatment_type) %>%
  distinct(case_submitter_id,.keep_all = TRUE)  
setwd(outputFinder)
write.table(clinical_trait,"clinical_trait.txt",sep = "\\t",row.names = F,col.names = T,quote = F)                       ##输出精炼后的临床信息数据

#重新排序,将癌旁排在前面便于下一步筛选,0-9为癌数据,排在后面
mRNA_exprSet <- mRNA_exprSet %>%
  select(-matches(".-[0-9]{2}[^A]$")) %>%
  select(-matches(".-0[1-9][A]$"), everything())
setwd(outputFinder)
write.table(mRNA_exprSet,"expSet_TPM_ProCoding.txt",sep = "\\t",row.names = F,col.names = T,quote = F)
#这里得到的是一个mRNA_exprSet.txt文件,包含芯片数据和临床信息相匹配的汇总芯片矩阵信息

#得到表达矩阵行名,也就是带有相同顺序的基因名
rownames_mRNA_exprSet <- mRNA_exprSet[,1]
mRNA_exprSet$gene_name <- NULL   &&把这一类取消 

#写出samplenames
samplenames <- as.data.frame(colnames(mRNA_exprSet))
samplenames$Group <- "HNSC"
samplenames$Group[which((str_detect(samplenames[,1], ".-[^0][0-9][A]$") == T))] = "HC"
samplenames$Group[which((str_detect(samplenames[,1], ".-0[1-9][A]$") == T))] = "HNSC"
table(samplenames$Group)
samplefile <- samplenames
colnames(samplefile) <- c("Sample_Name","Sample_Group")
write.table(samplefile,"samplefile.txt",sep = "\\t",row.names = F,quote = F,col.names = T)

save( mRNA_exprSet, rownames_mRNA_exprSet, samplefile, file = "expSetTpm_TCGA.rda")
**# 下载GEO表达矩阵**

Sys.setenv(http_proxy="<http://127.0.0.1:7890>")
Sys.setenv(https_proxy="<http://127.0.0.1:7890>")
Sys.setenv(all_proxy="socks5://127.0.0.1:7890")
rm(list = ls())
rootDir <- paste0("~","/Rstudio")
setwd(rootDir)
options(stringsAsFactors = F)
Sys.setenv("VROOM_CONNECTION_SIZE"=500072)

library(librarian)
shelf(GEOquery,dplyr,tibble,limma,affy)
 
gseAcc <- "GSE30219"
f=paste0(gseAcc,'_eSet.Rdata')

if(!file.exists(f)){
  gset <- getGEO(gseAcc, destdir=".",
                 AnnotGPL = F,
                 getGPL = F)
  save(gset,file=f)
}

**# 提取GEO表达矩阵----这段代码把测序原始ID转换成了symbol**
rm(list = ls())
GPLfilename <- "/Users/sandy/Downloads/丹阳黄泰复现头颈鳞癌项目/GEO/GPL10558-50081.txt"
ann <- read.table(GPLfilename, sep = "\\t", quote = "", header = T, comment.char = "#", fill = T)
ID2symbol <- ann[,c("ID","Symbol")]

expFilePath <- "/Users/sandy/Downloads/丹阳黄泰复现头颈鳞癌项目/GEO/GSE65858_series_matrix.txt"
exp <- read.table(expFilePath,sep = "\\t",quote = "", fill = T,header = T, comment.char = "!", check.names = F)
library(stringr)
exp[,1] <- str_remove_all(pattern = "\\"",exp[,1])
colnames(exp) <- str_remove_all(pattern = "\\"",colnames(exp))

library(dplyr)
colnames(ID2symbol)[1] <- colnames(exp)[1]
exp_sym <- right_join(ID2symbol,exp)
exp_sym <- exp_sym[,-1]
geoGeneSymblol <- exp_sym[,1]
save(geoGeneSymblol,file = "geoGeneSymblol.rda")
save(exp_sym , file = "geo_exp_sym.rda")

#TCGA数据处理----
#TCGA无需combat移除批处理影响
rm(list = ls())
setwd("/Users/sandy/Downloads/丹阳黄泰复现头颈鳞癌项目/复现完整过程")
load("expSetTpm_TCGA.rda")
#转换为纯数字矩阵
#exp <- mRNA_exprSet
#dimnames=list(as.matrix(rownames_mRNA_exprSet)[,1],colnames(exp))
#raw_data <- matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
raw_data <- as.data.frame(log2(raw_data+1))
range(raw_data)
#与验证集取交集
load("geoGeneSymblol.rda")
raw_data1 <- raw_data[rownames(raw_data) %in% intersect(rownames(raw_data),geoGeneSymblol),]

#癌症组
tmp <- raw_data1[, colnames(raw_data1) %in% samplefile[samplefile$Sample_Group=="HNSC",]$Sample_Name]
dim(tmp)
c <- cor(tmp, method = "spearman")
library(pheatmap)
pheatmap(c)  #这里会报错,数据set不符合pheatmap格式!
rMeans <- data.frame(rowMeans(c))
HNSC_tmp1 <- tmp[, rowMeans(c) > 0.85]
dim(HNSC_tmp1)
c1 <- cor(HNSC_tmp1, method = "spearman")
c2 <- cbind(raw_data1[,"gene_name"],tmp)  ##这里是为了符合pheatmap矩阵
pheatmap(c2)

#正常组
tmp2 = raw_data1[, samplefile[samplefile$Sample_Group=="HC",]$Sample_Name]
dim(tmp2)
c2 <- cor(tmp2, method = "spearman")
pheatmap(c2)
tmp3 <- tmp2[, rowMeans(c2) > 0.85]
dim(tmp3)
c3 <- cor(tmp3, method = "spearman")
pheatmap(c3)

#表达量总矩阵
exprSet=cbind(HNSC_tmp1,tmp3)
range(exprSet)
samplefile=samplefile[match(colnames(exprSet),samplefile$Sample_Name),]
#保存癌症相关表达矩阵,总矩阵,样本分型
save(HNSC_tmp1,exprSet, samplefile, file="step2combatHNSC-0.85.rda")
#R中如何将第一列序号改成基因名称
rownames(data)=data[,1]  #取出第一列
data=data[,-1]          #将第一列删除
head(data)
#临床信息如何分组
rm(list=ls())
setwd('/Users/linfengni/learning project/LUAD/test')
library(dplyr)
library(survival)
library(survminer)
outputFinder <- "/Users/linfengni/learning project/LUAD/test"

clinic <- read.table("/Users/linfengni/learning project/LUAD/LUAD临床信息/临床数据/clinical.cart.2022-10-30/clinical.tsv",sep = "\\t",quote = "",fill = T,header = T)

clinical_trait <- clinic %>%
  dplyr::select(case_submitter_id,age_at_index,ajcc_pathologic_m,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,gender,vital_status,days_to_last_follow_up, site_of_resection_or_biopsy,treatment_type,days_to_death) %>%
  distinct(case_submitter_id,.keep_all = TRUE)  
setwd(outputFinder)
write.table(clinical_trait,"clinical_trait.txt",sep = "\\t",row.names = F,col.names = T,quote = F)
library(reshape)
#死亡和存活患者的临床信息进行分组
dead_patient <- clinical_trait %>%
  dplyr::filter(vital_status == 'Dead') %>%
  dplyr::select(-days_to_last_follow_up) %>%
  mutate(vital_status=ifelse(vital_status=='Dead',1,0))

alive_patient <- clinical_trait %>%
  dplyr::filter(vital_status == 'Alive') %>%
  dplyr::select(-days_to_last_follow_up) %>%
  mutate(vital_status=ifelse(vital_status=='Dead',1,0))

#R做卡方检验
tableR <- matrix(c(79,37,239,129),nrow=2,ncol=2)
chisq.test(tableR)

#单因素cox分析
library("survival")
library("survminer")
res.cox <- coxph(Surv(OS.Time, OS) ~ Gender, data = sex_group)
res.cox
summary(res.cox)

#多因素cox分析
res.cox <- coxph(Surv(OS.Time, OS) ~ Age + Gender, data =  survival_data)
summary(res.cox)

#df