mouse_se_transcription
HYX
5/28/2021
opts_knit$set(progress = TRUE,verbose = TRUE)
转录组下游基础分析
检测数据的相关系数和批次效应
通过对原始Count矩阵进行相关系数的计算和聚类,可以初步评价转录组的质量。组内相关系数应该大于0.9,若组内相关系数小于组间相关系数,应考虑样本污染或错误。
setwd("~/bioinformatics/sun/yjc/xzj")
library(readr)
library(FactoMineR)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.84 loaded
total_uniqmap <- read_delim("total_uniqmap.txt",
"\t", escape_double = FALSE, comment = "#",
trim_ws = TRUE)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Geneid = col_character(),
## Chr = col_character(),
## Start = col_character(),
## End = col_character(),
## Strand = col_character(),
## Length = col_double(),
## C1_S133.bam = col_double(),
## C3_S175.bam = col_double(),
## clean_C2_S178.bam = col_double(),
## D1_S228.bam = col_double(),
## D2_S229.bam = col_double(),
## D3_S134.bam = col_double(),
## H1_S132.bam = col_double(),
## H2_S130.bam = col_double(),
## H3_S177.bam = col_double()
## )
head(total_uniqmap)
## # A tibble: 6 x 15
## Geneid Chr Start End Strand Length C1_S133.bam C3_S175.bam
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ENSMUSG… chr1 3143476 3144545 + 1070 2 5
## 2 ENSMUSG… chr1 3172239 3172348 + 110 0 0
## 3 ENSMUSG… chr1;chr… 3276124;3… 3277540;3… -;-;-… 6094 0 0
## 4 ENSMUSG… chr1 3322980 3323459 + 480 0 0
## 5 ENSMUSG… chr1 3435954 3438772 - 2819 0 1
## 6 ENSMUSG… chr1 3445779 3448011 - 2233 0 0
## # … with 7 more variables: clean_C2_S178.bam <dbl>, D1_S228.bam <dbl>,
## # D2_S229.bam <dbl>, D3_S134.bam <dbl>, H1_S132.bam <dbl>, H2_S130.bam <dbl>,
## # H3_S177.bam <dbl>
#通过观察,发现Geneid存在版本号,不利于下游ID转换,用正则替换去掉
total_uniqmap$Geneid<-gsub("\\..*", "", total_uniqmap$Geneid)
head(total_uniqmap)
## # A tibble: 6 x 15
## Geneid Chr Start End Strand Length C1_S133.bam C3_S175.bam
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ENSMUSG… chr1 3143476 3144545 + 1070 2 5
## 2 ENSMUSG… chr1 3172239 3172348 + 110 0 0
## 3 ENSMUSG… chr1;chr… 3276124;3… 3277540;3… -;-;-… 6094 0 0
## 4 ENSMUSG… chr1 3322980 3323459 + 480 0 0
## 5 ENSMUSG… chr1 3435954 3438772 - 2819 0 1
## 6 ENSMUSG… chr1 3445779 3448011 - 2233 0 0
## # … with 7 more variables: clean_C2_S178.bam <dbl>, D1_S228.bam <dbl>,
## # D2_S229.bam <dbl>, D3_S134.bam <dbl>, H1_S132.bam <dbl>, H2_S130.bam <dbl>,
## # H3_S177.bam <dbl>
##可以看到表达矩阵是7:15列
corrplot::corrplot(cor(total_uniqmap[,c(7:15)]))
expr<-total_uniqmap[,c(1,7:15)]
grouplist=factor(c("C","C","C","D","D","D","H","H","H"))
dat.pca<-PCA(t(expr[,c(2:10)]),graph = FALSE)
dat.pca
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 9 individuals, described by 55416 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
fviz_pca_ind(dat.pca,ind="point",col.ind=grouplist,legend.title="Groups")
如果不存在明显的批次效应和测序污染,可以直接进行下游分析 ## 进行转录组分析 log2foldchage取1,即表达量变化两倍以上的认为是上调或下调基因。pvalue经过FDR校正,取0.05
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
count_data=expr[,c(2:7)]
row.names(count_data)<-total_uniqmap$Geneid
## Warning: Setting row names on a tibble is deprecated.
condition<-factor(c("C","C","C","D","D","D"),levels=c("C","D"))#想清楚Control的位置
col_data<-data.frame(row.names=colnames(count_data),condition)
dds<-DESeqDataSetFromMatrix(countData=count_data,colData=col_data,design=~condition)
## converting counts to integer mode
dds_filter<-dds[rowSums(counts(dds))>1,]#样本基因表达量之和大于1
dds_out<-DESeq(dds_filter)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
expr_new<-counts(dds_out)
res<-results(dds_out)
res2<-as.data.frame(res)#保存成表格格式
res2$g=ifelse(res2$pvalue>0.05,'stable',
ifelse( res2$log2FoldChange >1,'up',
ifelse( res2$log2FoldChange < -1,'down','stable') )
)
res2$v= -log10(res2$padj)
library(org.Mm.eg.db)
## Loading required package: AnnotationDbi
##
name<-select(org.Mm.eg.db,keys=rownames(res),columns = "SYMBOL",keytype = "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
name2<-name[!duplicated(name$ENSEMBL),]
res2$ENSEMBL=rownames(res2)
res3<-merge(name2,res2,by="ENSEMBL")
name<-select(org.Mm.eg.db,keys=rownames(res),columns = "ENTREZID",keytype = "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
name2<-name[!duplicated(name$ENSEMBL),]
res3<-merge(name2,res3,by="ENSEMBL")
table(res3$g)
##
## down stable up
## 219 35651 173
library(ggpubr)
ggpubr::ggscatter(res3, x = "log2FoldChange", y = "v", color = "g",size = 0.5,palette = c("#00AFBB", "#E7B800", "#FC4E07") )
## Warning: Removed 20270 rows containing missing values (geom_point).
x=res3$log2FoldChange
names(x)=rownames(res3)
cg=c(names(head(sort(x),100)),
names(tail(sort(x),100)))
library(pheatmap)
n<-t(scale(t(as.matrix(expr_new[res3[cg,]$ENSEMBL,]))))
n[n>4]=4
n[n< -4]= -4
ac=data.frame(groupList=grouplist[1:6])
rownames(ac)=colnames(n)
pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac)
write.csv(res3,col.names = F,"CD.csv")
## Warning in write.csv(res3, col.names = F, "CD.csv"): attempt to set 'col.names'
## ignored
后续富集分析
KEGG和GO分析
library(clusterProfiler)
##
## clusterProfiler v3.18.1 For help: https://guangchuangyu.github.io/software/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
upgene<-res3[which(res3$g=="up"),]
upgene$ENTREZID
## [1] "16414" "57435" "11768" "54153" "18423" "53321"
## [7] "71774" "242705" "13195" "192897" "668303" "18810"
## [13] "23872" "675812" "20532" "12511" "21908" "214951"
## [19] "227737" "70789" "140483" "11924" "21942" "11787"
## [25] "24110" "14562" "21946" "18726" "11784" "12310"
## [31] "320707" "71607" "11806" "72446" "12733" "26878"
## [37] "104382" "231440" "241770" "27261" "216152" "235472"
## [43] "13654" "75607" "380713" "232371" "13799" "380732"
## [49] "227394" "68723" "328795" "15438" "100042416" "224912"
## [55] "140497" "110380" "327747" "328643" NA "16485"
## [61] "238331" "227357" "244418" "234695" "101122" "74748"
## [67] "235584" NA "330189" NA "17879" "71797"
## [73] "22253" "71453" "22139" "435626" "12399" "668208"
## [79] "194604" NA "629114" "373852" "404711" "791260"
## [85] "100042480" NA "619991" "108167891" "100038997" NA
## [91] NA NA "100039775" NA "67238" NA
## [97] NA "100312540" NA "108168715" NA NA
## [103] "100415901" NA NA "100040298" NA NA
## [109] NA NA NA "331195" "102636792" "115486427"
## [115] "67765" "619312" "100042074" NA NA NA
## [121] NA NA NA NA NA NA
## [127] NA NA "353235" NA NA NA
## [133] NA NA "115490131" "108169122" NA NA
## [139] "100384878" NA NA NA NA NA
## [145] NA NA NA NA NA NA
## [151] NA NA NA NA NA NA
## [157] "674228" "100418227" NA NA "665682" NA
## [163] NA "102632818" NA "100312474" NA NA
## [169] NA NA NA "100861647" NA
goup<- enrichGO(upgene$ENTREZID, OrgDb = org.Mm.eg.db, ont='ALL',pAdjustMethod = 'BH',pvalueCutoff = 1,qvalueCutoff = 1,keyType = 'ENTREZID',readable = T)
downgene<-res3[which(res3$g=="down"),]
godown<- enrichGO(downgene$ENTREZID, OrgDb = org.Mm.eg.db, ont='ALL',pAdjustMethod = 'BH',pvalueCutoff = 1,qvalueCutoff = 1,keyType = 'ENTREZID',readable = T)
head(goup)
## ONTOLOGY ID Description GeneRatio
## GO:0007422 BP GO:0007422 peripheral nervous system development 4/82
## GO:0007411 BP GO:0007411 axon guidance 6/82
## GO:0097485 BP GO:0097485 neuron projection guidance 6/82
## GO:0014037 BP GO:0014037 Schwann cell differentiation 3/82
## GO:0090009 BP GO:0090009 primitive streak formation 2/82
## GO:0045165 BP GO:0045165 cell fate commitment 6/82
## BgRatio pvalue p.adjust qvalue
## GO:0007422 74/23328 0.0001353099 0.1154813 0.1053452
## GO:0007411 251/23328 0.0002584882 0.1154813 0.1053452
## GO:0097485 252/23328 0.0002640578 0.1154813 0.1053452
## GO:0014037 41/23328 0.0004052297 0.1232602 0.1124413
## GO:0090009 10/23328 0.0005392984 0.1232602 0.1124413
## GO:0045165 292/23328 0.0005766773 0.1232602 0.1124413
## geneID Count
## GO:0007422 Cntnap1/Itgb4/Egr2/Hoxd9 4
## GO:0007411 Neurog2/Apbb2/Prtg/Egr2/Unc5c/Runx3 6
## GO:0097485 Neurog2/Apbb2/Prtg/Egr2/Unc5c/Runx3 6
## GO:0014037 Cntnap1/Itgb4/Egr2 3
## GO:0090009 Ets2/Gdf3 2
## GO:0045165 Ets2/Tlx1/Neurog2/Gdf3/Barhl2/Slamf8 6
head(godown)
## ONTOLOGY ID
## GO:0070293 BP GO:0070293
## GO:0002507 BP GO:0002507
## GO:0031638 BP GO:0031638
## GO:0098953 BP GO:0098953
## GO:0098970 BP GO:0098970
## GO:0099628 BP GO:0099628
## Description GeneRatio
## GO:0070293 renal absorption 2/73
## GO:0002507 tolerance induction 2/73
## GO:0031638 zymogen activation 2/73
## GO:0098953 receptor diffusion trapping 1/73
## GO:0098970 postsynaptic neurotransmitter receptor diffusion trapping 1/73
## GO:0099628 neurotransmitter receptor diffusion trapping 1/73
## BgRatio pvalue p.adjust qvalue geneID Count
## GO:0070293 17/23328 0.001274225 0.5287707 0.5239115 Slc15a2/Aqp3 2
## GO:0002507 29/23328 0.003712749 0.5287707 0.5239115 Il2ra/Ccr4 2
## GO:0031638 69/23328 0.019796163 0.5287707 0.5239115 Klk1b4/Klk1b3 2
## GO:0098953 10/23328 0.030861735 0.5287707 0.5239115 Cacng7 1
## GO:0098970 10/23328 0.030861735 0.5287707 0.5239115 Cacng7 1
## GO:0099628 10/23328 0.030861735 0.5287707 0.5239115 Cacng7 1
geneList<-res3$log2FoldChange
names(geneList)<-res3$SYMBOL
geneList<-sort(geneList,decreasing = T)
write.csv(goup@result,col.names = F,row.names = F,"CD_goup_results.csv")
## Warning in write.csv(goup@result, col.names = F, row.names = F,
## "CD_goup_results.csv"): attempt to set 'col.names' ignored
write.csv(godown@result,col.names = F,row.names = F,"CD_godown_results.csv")
## Warning in write.csv(godown@result, col.names = F, row.names = F,
## "CD_godown_results.csv"): attempt to set 'col.names' ignored
上调基因GO
barplot(goup,showCategory=30,drop=T)
cnetplot(goup,categorySiza="pvalue",foldChange = geneList,circular=TRUE)
下调基因GO
barplot(godown,showCategory=30,drop=T)
cnetplot(godown,categorySiza="pvalue",foldChange = geneList,circular=TRUE)
###上调基因KEGG
# Sys.setenv(http_proxy="http://127.0.0.1:8001")
kkup <- enrichKEGG(
gene = upgene$ENTREZID,
keyType = 'kegg',
organism = 'mmu',
pvalueCutoff = 1,qvalueCutoff = 1)
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
kkup2<-setReadable(kkup,'org.Mm.eg.db','ENTREZID')
dotplot(kkup, showCategory = 20)
## wrong orderBy parameter; set to default `orderBy = "x"`
cnetplot(kkup2,categorySiza="pvalue",foldChange = geneList,circular=TRUE)
下调基因KEGG
kkdown <- enrichKEGG(
gene = downgene$ENTREZID,
keyType = 'kegg',
organism = 'mmu',
pvalueCutoff = 1,qvalueCutoff = 1)
kkdown2<-setReadable(kkdown,'org.Mm.eg.db','ENTREZID')
dotplot(kkdown, showCategory = 20)
## wrong orderBy parameter; set to default `orderBy = "x"`
cnetplot(kkdown2,categorySiza="pvalue",foldChange = geneList,circular=TRUE)
write.csv(kkup2@result,row.names = F,col.names = F,"CD_kegg_up.csv")
## Warning in write.csv(kkup2@result, row.names = F, col.names = F,
## "CD_kegg_up.csv"): attempt to set 'col.names' ignored
write.csv(kkdown2@result,row.names = F,col.names = F,"CD_kegg_down.csv" )
## Warning in write.csv(kkdown2@result, row.names = F, col.names = F,
## "CD_kegg_down.csv"): attempt to set 'col.names' ignored
GSEA
geneList=res3$log2FoldChange
names(geneList)=res3$ENTREZID
geneList=sort(geneList,decreasing = T)
#data(geneList, package="DOSE")
kk_gse <- gseKEGG(geneList = geneList, organism = "mmu", nPerm = 1000, minGSSize = 120,pvalueCutoff = 1,verbose = FALSE)
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (15.11% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
library(enrichplot)
##
## Attaching package: 'enrichplot'
## The following object is masked from 'package:ggpubr':
##
## color_palette
gseaplot2(kk_gse,geneSetID = c(1,2,3))
gseaplot2(kk_gse,geneSetID = c(4,5,6))
gseaplot2(kk_gse,geneSetID = c(7,8,9))
gseago = gseGO(geneList ,
OrgDb = org.Mm.eg.db, ont = 'all', nPerm = 1000, minGSSize = 10, pvalueCutoff = 1, verbose = FALSE)
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (15.11% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
gseaplot2(gseago,geneSetID = c(1,2,3),pvalue_table = TRUE)
gseaplot2(gseago,geneSetID = c(4,5,6),pvalue_table = TRUE)
gseaplot2(gseago,geneSetID = c(7,8,9),pvalue_table = TRUE)
write.csv(kk_gse@result,row.names = F,col.names = F,"CD_KEGG_GSEA.csv")
## Warning in write.csv(kk_gse@result, row.names = F, col.names = F,
## "CD_KEGG_GSEA.csv"): attempt to set 'col.names' ignored
write.csv(gseago@result,row.names = F,col.names = F,"CD_GO_GSEA.csv")
## Warning in write.csv(gseago@result, row.names = F, col.names = F,
## "CD_GO_GSEA.csv"): attempt to set 'col.names' ignored