suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
## COPY THE DATA AND SLUG FROM THE HEADER
DATA = glue("{PRE}/2022-04-20-gwas-catalog-analysis-2022")
if(!file.exists(DATA)) system(glue("mkdir {DATA}"))
WORK=DATA
## download data from GWAS catalog
## https://www.ebi.ac.uk/gwas/docs/file-downloads
## move data to DATA
DATAFILE = "gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv"
#tempodata=("~/Downloads/tempo/gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv")
#system(glue("cp {tempodata} {DATA}/"))
gwascatalog = read_tsv(glue("{DATA}/{DATAFILE}"))
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 372752 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (26): FIRST AUTHOR, JOURNAL, LINK, STUDY, DISEASE/TRAIT, INITIAL SAMPLE...
## dbl (10): PUBMEDID, CHR_POS, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTAN...
## date (2): DATE ADDED TO CATALOG, DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(gwascatalog)
## [1] 372752 38
names(gwascatalog)
## [1] "DATE ADDED TO CATALOG" "PUBMEDID"
## [3] "FIRST AUTHOR" "DATE"
## [5] "JOURNAL" "LINK"
## [7] "STUDY" "DISEASE/TRAIT"
## [9] "INITIAL SAMPLE SIZE" "REPLICATION SAMPLE SIZE"
## [11] "REGION" "CHR_ID"
## [13] "CHR_POS" "REPORTED GENE(S)"
## [15] "MAPPED_GENE" "UPSTREAM_GENE_ID"
## [17] "DOWNSTREAM_GENE_ID" "SNP_GENE_IDS"
## [19] "UPSTREAM_GENE_DISTANCE" "DOWNSTREAM_GENE_DISTANCE"
## [21] "STRONGEST SNP-RISK ALLELE" "SNPS"
## [23] "MERGED" "SNP_ID_CURRENT"
## [25] "CONTEXT" "INTERGENIC"
## [27] "RISK ALLELE FREQUENCY" "P-VALUE"
## [29] "PVALUE_MLOG" "P-VALUE (TEXT)"
## [31] "OR or BETA" "95% CI (TEXT)"
## [33] "PLATFORM [SNPS PASSING QC]" "CNV"
## [35] "MAPPED_TRAIT" "MAPPED_TRAIT_URI"
## [37] "STUDY ACCESSION" "GENOTYPING TECHNOLOGY"
## show number of entries in the GWAS catalog with cancer
gwascatalog %>% select(`DISEASE/TRAIT`, MAPPED_GENE) %>% filter(grepl("cancer",`DISEASE/TRAIT`)) %>% dim()
## [1] 7729 2
Number of distinct loci reported in GWAS of various cancers
Number of entries in the GWAS catalog with cancer, unique loci (it’s a reasonable assumption that mapped genes will be a good way to count loci)
gwascatalog %>% select(`DISEASE/TRAIT`, MAPPED_GENE) %>% filter(grepl("cancer",`DISEASE/TRAIT`)) %>% unique() %>% dim()
## [1] 4911 2