TxRegQuery addresses exploration of transcriptional regulatory networks by integrating data on eQTL, digital genomic footprinting (DGF), DnaseI hypersensitivity binding data (DHS), and transcription factor binding site (TFBS) data. Owing to the volume of emerging tissue-specific data, special data modalities are used.
txregnet
databaseWe have a long-running server that will respond to queries. We focus on mongolite as the interface.
suppressPackageStartupMessages({
library(TxRegInfra)
library(mongolite)
library(Gviz)
library(EnsDb.Hsapiens.v75)
library(BiocParallel)
register(SerialParam())
})
con1 = mongo(url=URL_txregInAWS(), db="txregnet")
con1
## <Mongo collection> 'test'
## $aggregate(pipeline = "{}", options = "{\"allowDiskUse\":true}", handler = NULL, pagesize = 1000, iterate = FALSE)
## $count(query = "{}")
## $disconnect(gc = TRUE)
## $distinct(key, query = "{}")
## $drop()
## $export(con = stdout(), bson = FALSE, query = "{}", fields = "{}", sort = "{\"_id\":1}")
## $find(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0, handler = NULL, pagesize = 1000)
## $import(con, bson = FALSE)
## $index(add = NULL, remove = NULL)
## $info()
## $insert(data, pagesize = 1000, stop_on_error = TRUE, ...)
## $iterate(query = "{}", fields = "{\"_id\":0}", sort = "{}", skip = 0, limit = 0)
## $mapreduce(map, reduce, query = "{}", sort = "{}", limit = 0, out = NULL, scope = NULL)
## $remove(query, just_one = FALSE)
## $rename(name, db = NULL)
## $replace(query, update = "{}", upsert = FALSE)
## $run(command = "{\"ping\": 1}", simplify = TRUE)
## $update(query, update = "{\"$set\":{}}", filters = NULL, upsert = FALSE, multiple = FALSE)
We will write methods that work with the ‘fields’ of this object.
There is not much explicit reflectance in the mongolite API. The following is improvised and may be fragile:
## $name
## [1] "test"
##
## $db
## [1] "txregnet"
##
## $url
## [1] "mongodb+srv://user:user123@txregnet-kui9i.mongodb.net/txregnet"
##
## $options
## List of 6
## $ pem_file : NULL
## $ ca_file : NULL
## $ ca_dir : NULL
## $ crl_file : NULL
## $ allow_invalid_hostname: logi FALSE
## $ weak_cert_validation : logi FALSE
If the mongo
utility is available as a system
command, we can get a list of collections in the database
as follows.
## Error in system2(cmd, args = "--help", stdout = TRUE, stderr = TRUE) :
## error in running command
## install mongodb on your system to use this function
Otherwise, as long as mongolite is installed, as long as we know the collection names of interest, we can use them as noted throughout this vignette.
We can get a record from a given collection:
mongo(url=URL_txregInAWS(), db="txregnet",
collection="Adipose_Subcutaneous_allpairs_v7_eQTL")$find(limit=1)
## gene_id variant_id tss_distance ma_samples ma_count
## 1 ENSG00000238009.2 1_807994_C_T_b37 678771 17 18
## maf pval_nominal slope slope_se qvalue chr snp_pos A1 A2
## 1 0.0233766 0.00126668 -0.759564 0.233531 0.0742794 1 807994 C T
## build
## 1 b37
Queries can be composed using JSON. We have a tool to generate queries that employ the mongodb aggregation method. Here we demonstrate this by computing, for each chromosome, the count and minimum values of the footprint statistic on CD14 cells.
m1 = mongo(url = URL_txregInAWS(), db = "txregnet", collection="CD14_DS17215_hg19_FP")
newagg = makeAggregator( by="chr", vbl="stat", op="$min", opname="min")
The JSON layout of this aggregating query is
[
{
"$group": {
"_id": ["$chr"],
"count": {
"$sum": [1]
},
"min": {
"$min": ["$stat"]
}
}
}
]
Invocation returns a data frame:
## _id count min
## 1 chrY 827 0.01907390
## 2 chr18 15868 0.06107950
## 3 chr10 40267 0.00601357
## 4 chr4 32947 0.02776440
## 5 chr6 54728 0.00565057
## 6 chr17 47987 0.01242310
We need to bind the metadata and information about the mongodb.
The following turns a very ad hoc filtering of the collection names into a DataFrame.
## DataFrame with 2 rows and 3 columns
## base type
## <character> <character>
## Adipose_Subcutaneous_allpairs_v7_eQTL Adipose eQTL
## Adipose_Visceral_Omentum_allpairs_v7_eQTL Adipose eQTL
## mid
## <character>
## Adipose_Subcutaneous_allpairs_v7_eQTL Subcutaneous_allpairs_v7
## Adipose_Visceral_Omentum_allpairs_v7_eQTL Visceral_Omentum_allpairs_v7
A key method in development is subsetting the archive by genomic coordinates.
si = GenomeInfoDb::Seqinfo(genome="hg19")["chr17"] # to fix query genome
myg = GRanges("chr17", IRanges(38.07e6,38.09e6), seqinfo=si)
s1 = sbov(rme1, myg, simplify=FALSE)
## ..........................................
## class: RaggedExperiment
## dim: 1676 42
## assays(3): chr id stat
## rownames: NULL
## colnames(42): CD14_DS17215_hg19_FP CD19_DS17186_hg19_FP ...
## iPS_19_11_DS15153_hg19_FP vHMEC_DS18406_hg19_FP
## colData names(6): base type ... type mid
## [1] 1676 42
## fLung_DS14724_hg19_FP fMuscle_arm_DS17765_hg19_FP
## chr17:38084160-38084169 0.533333 NA
## chr17:38084924-38084952 0.890476 NA
## chr17:38080857-38080891 NA 0.54902
## chr17:38081914-38081926 NA 0.50000
ormm = txmodels("ORMDL3", plot=FALSE, name="ORMDL3")
sar = strsplit(rownames(sa), ":|-")
an = as.numeric
gr = GRanges(seqnames(ormm)[1], IRanges(an(sapply(sar,"[", 2)), an(sapply(sar,"[", 3))))
gr1 = gr
gr1$score = 1-sa[,1]
gr2 = gr
gr2$score = 1-sa[,2]
sc1 = DataTrack(gr1, name="Lung FP")
sc2 = DataTrack(gr2, name="Musc/Arm FP")
plotTracks(list(GenomeAxisTrack(), sc1, sc2, ormm), showId=TRUE)
sbov
We begin with three ‘single-concept’ assays with relevance to lung genomics. The v7 GTEx lung eQTL data, an encode DnaseI narrowPeak report on lung fibroblasts, and a digital genomic footprint report for fetal lung.
lname_eqtl = "Lung_allpairs_v7_eQTL"
lname_dhs = "ENCFF001SSA_hg19_HS" # see dnmeta, fibroblast of lung
lname_fp = "fLung_DS14724_hg19_FP"
si17 = GenomeInfoDb::Seqinfo(genome="hg19")["chr17"]
si17n = si17
GenomeInfoDb::seqlevelsStyle(si17n) = "NCBI"
s1 = sbov(rme0[,lname_eqtl], GRanges("17", IRanges(38.06e6, 38.15e6),
seqinfo=si17n))
## .
## .
## .
Now we have annotated GRanges for each assay. The eQTL data in part are:
## [1] "gene_id" "variant_id" "tss_distance" "ma_samples"
## [5] "ma_count" "maf" "pval_nominal" "slope"
## [9] "slope_se" "qvalue" "chr" "snp_pos"
## [13] "A1" "A2" "build" "origin"
## GRanges object with 6 ranges and 4 metadata columns:
## seqnames ranges strand | gene_id variant_id
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] 17 38061054 * | ENSG00000266469.1 17_38061054_G_A_b37
## [2] 17 38061439 * | ENSG00000161395.8 17_38061439_T_C_b37
## [3] 17 38061439 * | ENSG00000073605.14 17_38061439_T_C_b37
## [4] 17 38061439 * | ENSG00000172057.5 17_38061439_T_C_b37
## [5] 17 38061439 * | ENSG00000167914.6 17_38061439_T_C_b37
## [6] 17 38062196 * | ENSG00000161395.8 17_38062196_G_A_b37
## maf pval_nominal
## <numeric> <numeric>
## [1] 0.0195822 0.000772192
## [2] 0.420366 0.000399212
## [3] 0.420366 6.87714e-10
## [4] 0.420366 1.08337e-10
## [5] 0.420366 2.15704e-10
## [6] 0.418848 0.000309568
## -------
## seqinfo: 1 sequence from hg19 genome
The names of genes and variants used here are cumbersome – symbols and rsids are preferable.
addsyms = function(x, EnsDb=EnsDb.Hsapiens.v75::EnsDb.Hsapiens.v75) {
ensids = gsub("\\..*", "", x$gene_id) # remove post period
gns = genes(EnsDb)
x$symbol = gns[ensids]$symbol
x
}
s1 = addsyms(s1)
Note that it is possible to retrieve rsids for the SNPs by address. But this is a slow operation involving a huge SNPlocs package that we do not want to work with directly for this vignette.
> snpsByOverlaps(SNPlocs.Hsapiens.dbSNP144.GRCh37, s1b)
UnstitchedGPos object with 265 positions and 2 metadata columns:
seqnames pos strand | RefSNP_id alleles_as_ambig
<Rle> <integer> <Rle> | <character> <character>
[1] 17 38061054 * | rs36049276 R
[2] 17 38061439 * | rs4795399 Y
[3] 17 38062196 * | rs2305480 R
[4] 17 38062217 * | rs2305479 Y
[5] 17 38062503 * | rs35104165 Y
... ... ... ... . ... ...
[261] 17 38149258 * | rs58212353 K
[262] 17 38149350 * | rs8073254 V
[263] 17 38149411 * | rs34648856 R
[264] 17 38149724 * | rs3785549 Y
[265] 17 38149727 * | rs3785550 H
-------
seqinfo: 25 sequences (1 circular) from GRCh37.p13 genome
The object s1
computed above is available as
demo_eQTL_granges
. We convert it to a graph via
##
## Attaching package: 'graph'
## The following object is masked from 'package:Biostrings':
##
## complement
## A graphNEL graph with directed edges
## Number of Nodes = 312
## Number of Edges = 693
Nodes are SNPs and genes, edges are present when the resource (in this case the GTEx lung study) declares an association (in this case, an FDR for SNP-gene association not exceeding 0.10.) The graph library includes functions for creation of incidence matrices from graphs, and vice versa.
Given the GRanges representations for sbov
results,
we can use overlap computations to conveniently
identify relationships between eQTL SNPs, genes,
and hypersensitivity or footprint regions.
We use sbov_output_HS
as a persistent instance of
s2
computed above.
seqlevelsStyle(demo_eQTL_granges) = "UCSC"
fo1 = findOverlaps(demo_eQTL_granges, sbov_output_HS)
fo1
## Hits object with 11 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 205 2
## [2] 206 2
## [3] 207 2
## [4] 458 9
## [5] 459 9
## [6] 460 9
## [7] 461 9
## [8] 462 9
## [9] 463 9
## [10] 464 9
## [11] 465 9
## -------
## queryLength: 693 / subjectLength: 11
## GRangesList object of length 2:
## $`2`
## GRanges object with 3 ranges and 17 metadata columns:
## seqnames ranges strand | gene_id variant_id
## <Rle> <IRanges> <Rle> | <factor> <character>
## [1] chr17 38085385 * | ENSG00000073605.14 17_38085385_A_C_b37
## [2] chr17 38085385 * | ENSG00000172057.5 17_38085385_A_C_b37
## [3] chr17 38085385 * | ENSG00000264968.1 17_38085385_A_C_b37
## tss_distance ma_samples ma_count maf pval_nominal slope
## <integer> <integer> <integer> <numeric> <numeric> <numeric>
## [1] 10482 172 207 0.270235 3.7188e-08 0.161276
## [2] 1531 172 207 0.270235 3.07153e-09 0.193448
## [3] 1390 172 207 0.270235 0.0004948 -0.230682
## slope_se qvalue chr snp_pos A1 A2
## <numeric> <numeric> <integer> <integer> <factor> <factor>
## [1] 0.0285803 9.35454706201541e-06 17 38085385 A C
## [2] 0.0317052 9.35523646756825e-07 17 38085385 A C
## [3] 0.065533 0.0400268770769804 17 38085385 A C
## build origin symbol
## <factor> <character> <character>
## [1] b37 Lung_allpairs_v7_eQTL GSDMB
## [2] b37 Lung_allpairs_v7_eQTL ORMDL3
## [3] b37 Lung_allpairs_v7_eQTL RP11-387H17.4
## -------
## seqinfo: 1 sequence from hg19 genome
##
## $`9`
## GRanges object with 8 ranges and 17 metadata columns:
## seqnames ranges strand | gene_id variant_id
## <Rle> <IRanges> <Rle> | <factor> <character>
## [1] chr17 38115299 * | ENSG00000167914.6 17_38115299_C_T_b37
## [2] chr17 38115299 * | ENSG00000188895.7 17_38115299_C_T_b37
## [3] chr17 38115429 * | ENSG00000073605.14 17_38115429_C_CTG_b37
## [4] chr17 38115429 * | ENSG00000172057.5 17_38115429_C_CTG_b37
## [5] chr17 38115429 * | ENSG00000167914.6 17_38115429_C_CTG_b37
## [6] chr17 38115430 * | ENSG00000073605.14 17_38115430_A_C_b37
## [7] chr17 38115430 * | ENSG00000172057.5 17_38115430_A_C_b37
## [8] chr17 38115430 * | ENSG00000167914.6 17_38115430_A_C_b37
## tss_distance ma_samples ma_count maf pval_nominal slope
## <integer> <integer> <integer> <numeric> <numeric> <numeric>
## [1] -3927 59 62 0.0809399 1.73014e-06 -0.530695
## [2] -163252 59 62 0.0809399 0.000512527 -0.197247
## [3] 40526 267 350 0.456919 3.30451e-06 0.126418
## [4] 31575 267 350 0.456919 6.33228e-06 0.137325
## [5] -3797 267 350 0.456919 1.04416e-17 -0.49968
## [6] 40527 267 350 0.456919 3.30451e-06 0.126418
## [7] 31576 267 350 0.456919 6.33228e-06 0.137325
## [8] -3796 267 350 0.456919 1.04416e-17 -0.49968
## slope_se qvalue chr snp_pos A1 A2
## <numeric> <numeric> <integer> <integer> <factor> <factor>
## [1] 0.108872 0.00030774973449386 17 38115299 C T
## [2] 0.0561895 0.0411569937076073 17 38115299 C T
## [3] 0.0266958 0.000547360697034689 17 38115429 C CTG
## [4] 0.029902 0.000976300253765482 17 38115429 C CTG
## [5] 0.0549122 8.9303103388208e-15 17 38115429 C CTG
## [6] 0.0266958 0.000547360697034689 17 38115430 A C
## [7] 0.029902 0.000976300253765482 17 38115430 A C
## [8] 0.0549122 8.9303103388208e-15 17 38115430 A C
## build origin symbol
## <factor> <character> <character>
## [1] b37 Lung_allpairs_v7_eQTL GSDMA
## [2] b37 Lung_allpairs_v7_eQTL MSL1
## [3] b37 Lung_allpairs_v7_eQTL GSDMB
## [4] b37 Lung_allpairs_v7_eQTL ORMDL3
## [5] b37 Lung_allpairs_v7_eQTL GSDMA
## [6] b37 Lung_allpairs_v7_eQTL GSDMB
## [7] b37 Lung_allpairs_v7_eQTL ORMDL3
## [8] b37 Lung_allpairs_v7_eQTL GSDMA
## -------
## seqinfo: 1 sequence from hg19 genome
This shows that there are two DHS sites that overlap with SNPs showing eQTL associations with various genes.
For the footprint data, we have:
## Hits object with 4 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 348 44
## [2] 349 44
## [3] 613 101
## [4] 614 101
## -------
## queryLength: 693 / subjectLength: 107
## GRangesList object of length 2:
## $`44`
## GRanges object with 2 ranges and 17 metadata columns:
## seqnames ranges strand | gene_id variant_id
## <Rle> <IRanges> <Rle> | <factor> <character>
## [1] chr17 38109075 * | ENSG00000172057.5 17_38109075_T_C_b37
## [2] chr17 38109075 * | ENSG00000167914.6 17_38109075_T_C_b37
## tss_distance ma_samples ma_count maf pval_nominal slope
## <integer> <integer> <integer> <numeric> <numeric> <numeric>
## [1] 25221 182 203 0.285915 0.000335618 0.135266
## [2] -10151 182 203 0.285915 2.23245e-14 -0.555363
## slope_se qvalue chr snp_pos A1 A2
## <numeric> <numeric> <integer> <integer> <factor> <factor>
## [1] 0.0373061 0.029325773492785 17 38109075 T C
## [2] 0.0693376 1.36746052792526e-11 17 38109075 T C
## build origin symbol
## <factor> <character> <character>
## [1] b37 Lung_allpairs_v7_eQTL ORMDL3
## [2] b37 Lung_allpairs_v7_eQTL GSDMA
## -------
## seqinfo: 1 sequence from hg19 genome
##
## $`101`
## GRanges object with 2 ranges and 17 metadata columns:
## seqnames ranges strand | gene_id variant_id
## <Rle> <IRanges> <Rle> | <factor> <character>
## [1] chr17 38137033 * | ENSG00000008838.13 17_38137033_A_G_b37
## [2] chr17 38137033 * | ENSG00000167914.6 17_38137033_A_G_b37
## tss_distance ma_samples ma_count maf pval_nominal slope
## <integer> <integer> <integer> <numeric> <numeric> <numeric>
## [1] -80435 274 359 0.468668 0.000771652 0.12397
## [2] 17807 274 359 0.468668 5.75457e-33 0.679408
## slope_se qvalue chr snp_pos A1 A2
## <numeric> <numeric> <integer> <integer> <factor> <factor>
## [1] 0.0365067 0.0565349739572534 17 38137033 A G
## [2] 0.0504649 1.32845831960793e-29 17 38137033 A G
## build origin symbol
## <factor> <character> <character>
## [1] b37 Lung_allpairs_v7_eQTL MED24
## [2] b37 Lung_allpairs_v7_eQTL GSDMA
## -------
## seqinfo: 1 sequence from hg19 genome
We have a small number of cloud-resident FIMO search results through the TFutils package.
library(TFutils)
data(demo_fimo_granges)
seqlevelsStyle(demo_eQTL_granges) = "UCSC"
lapply(demo_fimo_granges, lapply, function(x)
subsetByOverlaps(demo_eQTL_granges, x))
## $VDR
## $VDR$`chr17:38070000-38090000`
## GRanges object with 0 ranges and 17 metadata columns:
## seqnames ranges strand | gene_id variant_id tss_distance ma_samples
## <Rle> <IRanges> <Rle> | <factor> <character> <integer> <integer>
## ma_count maf pval_nominal slope slope_se qvalue chr
## <integer> <numeric> <numeric> <numeric> <numeric> <numeric> <integer>
## snp_pos A1 A2 build origin symbol
## <integer> <factor> <factor> <factor> <character> <character>
## -------
## seqinfo: 1 sequence from hg19 genome
##
##
## $POU2F1
## $POU2F1$`chr17:38070000-38090000`
## GRanges object with 8 ranges and 17 metadata columns:
## seqnames ranges strand | gene_id variant_id
## <Rle> <IRanges> <Rle> | <factor> <character>
## [1] chr17 38073968 * | ENSG00000161395.8 17_38073968_G_C_b37
## [2] chr17 38073968 * | ENSG00000073605.14 17_38073968_G_C_b37
## [3] chr17 38073968 * | ENSG00000172057.5 17_38073968_G_C_b37
## [4] chr17 38073968 * | ENSG00000167914.6 17_38073968_G_C_b37
## [5] chr17 38076198 * | ENSG00000161395.8 17_38076198_TATA_T_b37
## [6] chr17 38076198 * | ENSG00000073605.14 17_38076198_TATA_T_b37
## [7] chr17 38076198 * | ENSG00000172057.5 17_38076198_TATA_T_b37
## [8] chr17 38076198 * | ENSG00000167914.6 17_38076198_TATA_T_b37
## tss_distance ma_samples ma_count maf pval_nominal slope
## <integer> <integer> <integer> <numeric> <numeric> <numeric>
## [1] 220918 251 321 0.41906 0.000246542 0.0992119
## [2] -935 251 321 0.41906 6.84037e-10 0.172741
## [3] -9886 251 321 0.41906 4.19548e-11 0.205608
## [4] -45258 251 321 0.41906 4.86746e-10 -0.388867
## [5] 223148 285 378 0.497368 4.6723e-06 0.120394
## [6] 1295 285 378 0.497368 3.28042e-15 0.211642
## [7] -7656 285 378 0.497368 1.62754e-14 0.231067
## [8] -43028 285 378 0.497368 1.64858e-08 -0.346576
## slope_se qvalue chr snp_pos A1 A2
## <numeric> <numeric> <integer> <integer> <factor> <factor>
## [1] 0.0267552 0.0228108733681083 17 38073968 G C
## [2] 0.0271367 2.32091930777634e-07 17 38073968 G C
## [3] 0.0300749 1.71833992847155e-08 17 38073968 G C
## [4] 0.0605305 1.69012834889834e-07 17 38073968 G C
## [5] 0.0258368 0.000745086232487142 17 38076198 TATA T
## [6] 0.0255291 2.19298356188838e-12 17 38076198 TATA T
## [7] 0.0286818 1.01078884855481e-11 17 38076198 TATA T
## [8] 0.059801 4.42486787325189e-06 17 38076198 TATA T
## build origin symbol
## <factor> <character> <character>
## [1] b37 Lung_allpairs_v7_eQTL PGAP3
## [2] b37 Lung_allpairs_v7_eQTL GSDMB
## [3] b37 Lung_allpairs_v7_eQTL ORMDL3
## [4] b37 Lung_allpairs_v7_eQTL GSDMA
## [5] b37 Lung_allpairs_v7_eQTL PGAP3
## [6] b37 Lung_allpairs_v7_eQTL GSDMB
## [7] b37 Lung_allpairs_v7_eQTL ORMDL3
## [8] b37 Lung_allpairs_v7_eQTL GSDMA
## -------
## seqinfo: 1 sequence from hg19 genome