suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
## COPY THE DATA AND SLUG FROM THE HEADER
DATA = glue("{PRE}/2022-04-20-gwas-catalog-analysis-2022")
if(!file.exists(DATA)) system(glue("mkdir {DATA}"))
WORK=DATA

## download data from GWAS catalog
## https://www.ebi.ac.uk/gwas/docs/file-downloads
## move data to DATA
DATAFILE = "gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv"
#tempodata=("~/Downloads/tempo/gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv")
#system(glue("cp {tempodata} {DATA}/"))
gwascatalog = read_tsv(glue("{DATA}/{DATAFILE}"))
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 372752 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (26): FIRST AUTHOR, JOURNAL, LINK, STUDY, DISEASE/TRAIT, INITIAL SAMPLE...
## dbl  (10): PUBMEDID, CHR_POS, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTAN...
## date  (2): DATE ADDED TO CATALOG, DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(gwascatalog)
## [1] 372752     38
names(gwascatalog)
##  [1] "DATE ADDED TO CATALOG"      "PUBMEDID"                  
##  [3] "FIRST AUTHOR"               "DATE"                      
##  [5] "JOURNAL"                    "LINK"                      
##  [7] "STUDY"                      "DISEASE/TRAIT"             
##  [9] "INITIAL SAMPLE SIZE"        "REPLICATION SAMPLE SIZE"   
## [11] "REGION"                     "CHR_ID"                    
## [13] "CHR_POS"                    "REPORTED GENE(S)"          
## [15] "MAPPED_GENE"                "UPSTREAM_GENE_ID"          
## [17] "DOWNSTREAM_GENE_ID"         "SNP_GENE_IDS"              
## [19] "UPSTREAM_GENE_DISTANCE"     "DOWNSTREAM_GENE_DISTANCE"  
## [21] "STRONGEST SNP-RISK ALLELE"  "SNPS"                      
## [23] "MERGED"                     "SNP_ID_CURRENT"            
## [25] "CONTEXT"                    "INTERGENIC"                
## [27] "RISK ALLELE FREQUENCY"      "P-VALUE"                   
## [29] "PVALUE_MLOG"                "P-VALUE (TEXT)"            
## [31] "OR or BETA"                 "95% CI (TEXT)"             
## [33] "PLATFORM [SNPS PASSING QC]" "CNV"                       
## [35] "MAPPED_TRAIT"               "MAPPED_TRAIT_URI"          
## [37] "STUDY ACCESSION"            "GENOTYPING TECHNOLOGY"
## show number of entries in the GWAS catalog with cancer
gwascatalog %>% select(`DISEASE/TRAIT`, MAPPED_GENE) %>% filter(grepl("cancer",`DISEASE/TRAIT`)) %>% dim()
## [1] 7729    2

Number of distinct loci reported in GWAS of various cancers

Number of entries in the GWAS catalog with cancer, unique loci (it’s a reasonable assumption that mapped genes will be a good way to count loci)

gwascatalog %>% select(`DISEASE/TRAIT`, MAPPED_GENE) %>% filter(grepl("cancer",`DISEASE/TRAIT`)) %>% unique() %>% dim()
## [1] 4911    2

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The source code is licensed under MIT.

Suggest changes

If you find any mistakes (including typos) or want to suggest changes, please feel free to edit the source file of this page on Github and create a pull request.

Citation

For attribution, please cite this work as

Haky Im (2022). GWAS catalog analysis 2022. BIOS 25328 Cancer Genomics Class Notes. /post/2022/04/20/gwas-catalog-analysis-2022/

BibTeX citation

@misc{
  title = "GWAS catalog analysis 2022",
  author = "Haky Im",
  year = "2022",
  journal = "BIOS 25328 Cancer Genomics Class Notes",
  note = "/post/2022/04/20/gwas-catalog-analysis-2022/"
}