Goal

-[ ] download Personal Genome Project data (http://repgp.teamerlich.org/)

-[ ] analyze content, explore structure

-[ ] predict height and other phenotypes with PRS and compare to reported values

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(RSQLite)
sqlite <- dbDriver("SQLite")
DATA="/Users/haekyungim/Box/LargeFiles/imlab-data/data-Github/web-data/2021-04-21-personal-genomes-project-data/"
WORK=DATA

Define environmental variables for convenience

DATA="/Users/haekyungim/Box/LargeFiles/imlab-data/data-Github/web-data/2021-04-21-personal-genomes-project-data/"
WORK=$DATA
PRSice=/Users/haekyungim/bin/PRSice_mac
BIN="/Users/haekyungim/bin"
## download sqqtl database with phenotypes?
cd $DATA
wget http://files.teamerlich.org/repgp/repgp-data.sqlite3 
wget http://files.teamerlich.org/repgp/repgp.vcf.gz
dbname <- paste0(DATA,"repgp-data.sqlite3") ## add full path if db file not in current directory
## connect to db
db = dbConnect(sqlite,dbname)
## list tables
dbListTables(db)
## [1] "ancestry_jkp"   "ancestry_pop"   "ancestry_supop" "users"
dbListFields(db, "users") 
## [1] "id"       "sample"   "height"   "weight"   "human_id" "file_id"  "blood"   
## [8] "gender"   "race"
## convenience query function
query <- function(...) dbGetQuery(db, ...)
users = query("select * from users")
names(users)
## [1] "id"       "sample"   "height"   "weight"   "human_id" "file_id"  "blood"   
## [8] "gender"   "race"
users %>% count(race)
##                               race   n
## 1                                *  68
## 2 American Indian or Alaska Native   2
## 3                            Asian   6
## 4        Black or African American   1
## 5                Caucasian (White)   1
## 6               Hispanic or Latino   2
## 7                  Hispanic/Latino   3
## 8                            White 167
users  %>% count(gender)
##   gender   n
## 1      *  69
## 2 Female  46
## 3   Male 135
users %>% count(blood)
##   blood   n
## 1     * 111
## 2    A-  10
## 3    A+  41
## 4   AB-   1
## 5   AB+   4
## 6    B-   4
## 7    B+  16
## 8    O-  11
## 9    O+  52
users %>% ggplot(aes(height,fill=gender)) + geom_density(alpha=0.6) + ggtitle("Height by gender - Missing gender, *, has bi-modal distr.")

Download imputed genotype files

cd $DATA
for num in $(seq 22);
do
 wget http://files.teamerlich.org/repgp/repgp-imputed.chr${num}.vcf.gz
done

Create plink formatted genotype files. You will need plink2, which you can download from here

## install plink2 from https://www.cog-genomics.org/plink/2.0/
## more plink2 to ~/bin/
## ~/bin/plink2 --vcf repgp-imputed.chr22.vcf.gz    --make-bed --out repgp
cd $DATA
for num in $(seq 22);
do
~/bin/plink2 --vcf repgp-imputed.chr${num}.vcf.gz   --make-bed --out repgp.chr${num}
done

Choose a trait and download its GWAS summary statistics from this list https://uchicago.box.com/v/test-gtex-gwas-ukb-data

## here I chose sleep duration
cd $DATA
##wget https://uchicago.box.com/s/4gc4u1yr579qjbz8s6d27h8vb0h1yw1q
wget -O ukb_sleep_duration.gz https://uchicago.box.com/shared/static/4gc4u1yr579qjbz8s6d27h8vb0h1yw1q.gz
## height
wget -O ukb_height.gz https://uchicago.box.com/shared/static/2889tf5fx5d5ulll7otg4dltd1vogabh.gz

Create phenotype file

fam = read_tsv(glue::glue("{DATA}/repgp.fam"),col_names = FALSE)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   X2 = col_character(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_double()
## )
names(fam)[1:2] = c("FID","IID")
fam <- fam %>% select(FID, IID) %>% inner_join(users %>% select(sample,height,weight,gender), by=c("IID"="sample")) 
write_tsv(fam,file=glue::glue("{DATA}/phenodata.txt"))
## use --pheno-col to specify phenotype list
mkdir $WORK/output
Rscript $PRSice/PRSice.R --dir $PRSice \
    --prsice $PRSice/PRSice_mac \
    --base $WORK/ukb_height.gz \
    --target $WORK/repgp.chr# \
    --snp variant_id \
    --A1 effect_allele \
    --A2 non_effect_allele \
    --stat effect_size \
    --beta \
    --pvalue pvalue \
    --pheno-file $WORK/phenodata.txt \
    --pheno-col height \
    --bar-levels 5e-08,5e-07,5e-06,5e-05,5e-04,5e-03,5e-02,5e-01,1 \
    --fastscore \
    --binary-target F \
    --thread 2 \
    --out $WORK/output/height_score_all

Got the following error. Some SNP IDs are duplicated.

Error: A total of 14 duplicated SNP ID detected out of 
       6417846 input SNPs! Valid SNP ID (post --extract / 
       --exclude, non-duplicated SNPs) stored at 
       /Users/haekyungim/Box/LargeFiles/imlab-data/data-Github/web-data/2021-04-21-personal-genomes-project-data//output/height_score_all.valid. 
       You can avoid this error by using --extract 
       /Users/haekyungim/Box/LargeFiles/imlab-data/data-Github/web-data/2021-04-21-personal-genomes-project-data//output/height_score_all.valid 

Try again including only the SNP list created by the previous run of PRSice, height_score_all.valid A1 is the effect allele in PRSice, similar to Plink’s default. Other software use different conventions so this sign is something that need to be checked carefully.

Rscript $PRSice/PRSice.R --dir $PRSice \
    --prsice $PRSice/PRSice_mac \
    --base $WORK/ukb_height.gz \
    --target $WORK/repgp.chr# \
    --snp variant_id \
    --A1 effect_allele \
    --A2 non_effect_allele \
    --stat effect_size \
    --beta \
    --pvalue pvalue \
    --pheno-file $WORK/phenodata.txt \
    --pheno-col height \
    --bar-levels 5e-08,5e-07,5e-06,5e-05,5e-04,5e-03,5e-02,5e-01,1 \
    --fastscore \
    --binary-target F \
    --thread 2 \
    --extract $WORK/output/height_score_all.valid \
    --out $WORK/output/height_score_all

This time it worked. This figure shows the correlation (squared) between prediction and actual values provided in the phenodata.txt file. In this dataset, using SNPs with p-values up to 0.005 yielded the most significant correlation. Next, we will plot the predicted and actual values to get a better sense of how well PRS is predicting.

Plot predicted vs actual height

## we already have the phenotype data in the fam variable
predicted_height = read_delim(glue::glue("{DATA}/output/height_score_all.best"),delim=" ")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   FID = col_double(),
##   IID = col_character(),
##   In_Regression = col_character(),
##   PRS = col_double()
## )
combined <- predicted_height %>% inner_join(fam,by=c("IID"="IID")) 

combined %>% ggplot(aes(PRS,height)) + geom_point()

summary(lm(height ~ PRS,data=combined)) %>% coef() 
##                Estimate   Std. Error    t value      Pr(>|t|)
## (Intercept)    172.6716 6.075067e-01 284.230021 1.916175e-259
## PRS         159197.1399 2.478220e+04   6.423849  9.758137e-10

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The source code is licensed under MIT.

Suggest changes

If you find any mistakes (including typos) or want to suggest changes, please feel free to edit the source file of this page on Github and create a pull request.

Citation

For attribution, please cite this work as

Haky Im (2021). Personal Genomes Project Data. BIOS 25328 Cancer Genomics Class Notes. /post/2021/04/21/personal-genomes-project-data/

BibTeX citation

@misc{
  title = "Personal Genomes Project Data",
  author = "Haky Im",
  year = "2021",
  journal = "BIOS 25328 Cancer Genomics Class Notes",
  note = "/post/2021/04/21/personal-genomes-project-data/"
}