library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
work.dir ="~/Downloads/hapmap/"
## qqunif function
source("https://gist.githubusercontent.com/hakyim/38431b74c6c0bf90c12f/raw/21fbae9a48dc475f42fa60f0ef5509d071dea873/qqunif")

*What’s the population composition?**

popinfo = read_tsv(paste0(work.dir,"relationships_w_pops_051208.txt"))
popinfo %>% count(population)
samdata = read_tsv(paste0(work.dir,"phase3_corrected.psam"),guess_max = 2500) 
superpop = samdata %>% select(SuperPop,Population) %>% unique()
superpop = rbind(superpop, data.frame(SuperPop=c("EAS","HIS","AFR"),Population=c("CHD","MEX","MKK")))

Effect of population structure in Hardy Weinberg

## what happens if we calculate HWE with this mixed population?
system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --hardy --out {work.dir}output/allhwe"))
allhwe = read.table(glue::glue("{work.dir}output/allhwe.hwe"),header=TRUE,as.is=TRUE)
hist(allhwe$P)
qqunif(allhwe$P,main='HWE HapMap3 All Pop')

What if we calculate with single population?

pop = "CHB"
pop = "CEU"
pop = "YRI"
for(pop in c("CHB","CEU","YRI"))
{
  ## what if we calculate with single population?
  popinfo %>% filter(population==pop) %>%
    write_tsv(path=glue::glue("{work.dir}{pop}.fam") )
  system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --hardy --keep {work.dir}{pop}.fam --out {work.dir}output/hwe-{pop}"))
  pophwe = read.table(glue::glue("{work.dir}output/hwe-{pop}.hwe"),header=TRUE,as.is=TRUE)
  hist(pophwe$P,main=glue::glue("HWE {pop} and founders only"))
  qqunif(pophwe$P,main=glue::glue("HWE {pop} and founders only"))
}
## not much difference when --nonfounders option is added

What if we add non founders? Some of the samples in HapMap were recruited from families.

## what if we add nonfounders?
system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --hardy --keep {work.dir}CEU.fam --nonfounders --out {work.dir}output/CEUhwe_nf"))
CEUhwe_nf = read.table(glue::glue("{work.dir}output/CEUhwe_nf.hwe"),header=TRUE,as.is=TRUE)
hist(CEUhwe_nf$P,main="HWE CEU + non founders")
qqunif(CEUhwe_nf$P,main="HWE CEU + non founders")
qqplot(-log10(CEUhwe_nf$P),-log10(CEUhwe_nf$P),main="all vs founders only" );abline(0,1) 

GWAS on a growth phenotype in HapMap samples

## read igrowth
igrowth = read_tsv("https://raw.githubusercontent.com/hakyimlab/igrowth/master/rawgrowth.txt")
## fix FID from igrowth file
igrowth = popinfo %>% select(-pheno) %>% inner_join(igrowth %>% select(IID,growth), by=c("IID"="IID"))
write_tsv(igrowth,path=glue::glue("{work.dir}igrowth.pheno"))
igrowth %>% ggplot(aes(population,growth)) + geom_violin(aes(fill=population)) + geom_boxplot(width=0.2,col='black',fill='gray',alpha=.8) + theme_bw(base_size = 15)
summary( lm(growth~population,data=igrowth) )
system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --linear --pheno {work.dir}igrowth.pheno --pheno-name growth --maf 0.05 --out {work.dir}output/igrowth"))
igrowth.assoc = read.table(glue::glue("{work.dir}output/igrowth.assoc.linear"),header=T,as.is=T)
hist(igrowth.assoc$P)
qqunif(igrowth.assoc$P)
## install.packages("qqman")
library(qqman)
manhattan(igrowth.assoc, chr="CHR", bp="BP", snp="SNP", p="P" )

Simulate phenotype

set.seed(10) ## to get the same simulated values each time
simpheno = popinfo %>% mutate(pheno=rnorm(nrow(popinfo)))
write_tsv(simpheno, path=glue::glue("{work.dir}sim.pheno"))
## run association with plink
system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --linear --pheno {work.dir}sim.pheno --pheno-name pheno --maf 0.05 --out {work.dir}output/simpheno") )
simpheno.assoc = read.table(glue::glue("{work.dir}output/simpheno.assoc.linear"),header=T,as.is=T)
hist(simpheno.assoc$P)
qqunif(simpheno.assoc$P)
manhattan(simpheno.assoc, chr="CHR", bp="BP", snp="SNP", p="P" )

PCA calculation using plink

## generate PCs using plink
system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --pca --out {work.dir}output/pca"))
## read plink calculated PCs
pcplink = read.table(glue::glue("{work.dir}output/pca.eigenvec"),header=F, as.is=T)
names(pcplink) = c("FID","IID",paste0("PC", c(1:(ncol(pcplink)-2))) )
pcplink = popinfo %>% left_join(superpop,by=c("population"="Population")) %>% inner_join(pcplink, by=c("FID"="FID", "IID"="IID"))
## plot PC1 vs PC2
pcplink %>% ggplot(aes(PC1,PC2,col=population,shape=SuperPop)) + geom_point(size=3,alpha=.7) + theme_bw(base_size = 15)
manhattan(simpheno.assoc, chr="CHR", bp="BP", snp="SNP", p="P" ,main="Simulated Phenotype")

runnig igrowth GWAS using PCs

system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --linear --pheno {work.dir}igrowth.pheno --pheno-name growth --covar {work.dir}output/pca.eigenvec --covar-number 1-4 --hide-covar --maf 0.05 --out {work.dir}output/igrowth-adjPC"))
igrowth.adjusted.assoc = read.table(glue::glue("{work.dir}output/igrowth-adjPC.assoc.linear"),header=T,as.is=T)
##indadd = igrowth.adjusted.assoc$TEST=="ADD"
titulo = "igrowh association adjusted for PCs"
hist(igrowth.adjusted.assoc$P,main=titulo)
qqunif(igrowth.adjusted.assoc$P,main=titulo)

calculate Zscores

genomic.control = function(pvec)
{
 z = qnorm(pvec / 2)
 lambda = round(median(z^2) / 0.4549, 3)
 lambda
}

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The source code is licensed under MIT.

Suggest changes

If you find any mistakes (including typos) or want to suggest changes, please feel free to edit the source file of this page on Github and create a pull request.

Citation

For attribution, please cite this work as

R package build (2021). L6-population-structure. BIOS 25328 Cancer Genomics Class Notes. /post/2021/02/18/l6-population-structure/

BibTeX citation

@misc{
  title = "L6-population-structure",
  author = "R package build",
  year = "2021",
  journal = "BIOS 25328 Cancer Genomics Class Notes",
  note = "/post/2021/02/18/l6-population-structure/"
}