Association testing (GWAS)#

[1]:
%load_ext lab_black
import matplotlib.pyplot as plt
import admix
import numpy as np

np.random.seed(1234)
[2]:
dset = admix.io.read_dataset("example_data/CEU-YRI")
[3]:
dset
[3]:
admix.Dataset object with n_snp x n_indiv = 15357 x 10000, n_anc=2
        snp: 'CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'
[4]:
af_per_anc = dset.af_per_anc()  # calculate allele frequency by ancestry
snp_list = dset.snp.index[np.all((0.01 < af_per_anc) & (af_per_anc < 0.99), axis=1)][
    0:1000
]
dset = dset[snp_list, :]  # filter by allele frequency between 1%-99%.

# simulate binary phenotypes
sim = admix.simulate.quant_pheno(dset, n_causal=2, hsq=0.01, cor=1.0)
beta, pheno_g, pheno = sim["beta"], sim["pheno_g"], sim["pheno"]
print(beta.shape)  # (n_snp, n_anc, n_sim)
print(pheno_g.shape)  # (n_indiv, n_sim)
print(pheno.shape)  # (n_indiv, n_sim)
admix.data.af_per_anc: 100%|███████████████████████████████████████████████████████████████████████████████| 15/15 [00:10<00:00,  1.38it/s]
admix.simulate.quant_pheno: 100%|████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.72it/s]
(1000, 2, 10)
(10000, 10)
(10000, 10)

[5]:
dset
[5]:
admix.Dataset object with n_snp x n_indiv = 1000 x 10000, n_anc=2
        snp: 'CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'
[6]:
sim_i = 0
sim_pheno = pheno[:, sim_i]
sim_beta = beta[:, :, sim_i]
assoc_df = admix.assoc.marginal(
    dset=dset,
    pheno=sim_pheno,
    method="ATT",
    family="linear",
)
admix.assoc.marginal: 100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.32it/s]
[7]:
assoc_df
[7]:
G_BETA G_SE N P
snp
22:16406147:A:G -0.020341 0.028045 10000 4.682764e-01
22:16551808:T:C -0.015835 0.014218 10000 2.654258e-01
22:16573830:T:C -0.019458 0.030027 10000 5.169898e-01
22:16575525:T:C -0.005990 0.014806 10000 6.857933e-01
22:16576248:G:T -0.017495 0.017168 10000 3.081942e-01
... ... ... ... ...
22:19743069:G:A 0.092459 0.016553 10000 2.388196e-08
22:19746288:G:A -0.019776 0.019905 10000 3.204777e-01
22:19748331:T:C -0.022343 0.014272 10000 1.174854e-01
22:19748480:T:C -0.015623 0.014340 10000 2.759813e-01
22:19750087:T:C -0.031673 0.020432 10000 1.211368e-01

1000 rows × 4 columns

[8]:
fig, axes = plt.subplots(nrows=2, figsize=(5, 3), dpi=150, sharex=True)
axes[0].scatter(np.arange(dset.n_snp), sim_beta[:, 0], s=2)
axes[0].set_ylabel(r"$\beta$")

admix.plot.manhattan(assoc_df.P, ax=axes[1], s=2)
plt.tight_layout()
plt.show()
../_images/notebooks_assoc_8_0.png
[9]:
# pick one simulation
pval_dict = dict()
for method in ["TRACTOR", "ATT", "ASE"]:
    pval_dict[method] = admix.assoc.marginal(
        dset=dset,
        pheno=sim_pheno,
        family="linear",
        method=method,
    ).P
admix.assoc.marginal: 100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.22s/it]
admix.assoc.marginal: 100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.30it/s]
admix.assoc.marginal: 100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.50s/it]
[10]:
fig, axes = plt.subplots(figsize=(4, 2), dpi=150, ncols=2)
admix.plot.compare_pval(
    pval_dict["ATT"], pval_dict["TRACTOR"], xlabel="ATT", ylabel="Tractor", ax=axes[0]
)
admix.plot.compare_pval(
    pval_dict["ATT"], pval_dict["ASE"], xlabel="ATT", ylabel="ASE", ax=axes[1]
)
fig.tight_layout()
../_images/notebooks_assoc_10_0.png

HET test#

HET tests the null hypothesis for coefficients for a set of variables are identical. In the case of two-way admixed populations. It tests the null of \(H_0: \beta_{s,1} = \beta_{s,2}\) for each SNP \(s\).

In the following, although we simulate phenotype with no heterogeneity by ancestry, we still get an inflated p-value distribution. This is because the LD induce partial heterogeneity in the effect sizes.

Currently HET test is not implemented with cpp so one can expect slower performance. It is recommended to perform HET test in a small subset of variants after LD clumping.

# <out>.clumped will be created
plink \
    --bfile <bfile> \
    --clump <out>.assoc \
    --clump-p1 5e-8 \
    --clump-p2 1e-4 \
    --clump-r2 0.1 \
    --clump-kb 10000 \
    --memory 20000 \
    --out <out>

# get list of clumped SNPs
awk '(NF > 0) && (NR > 1) {print $3 }' <out>.clumped > <out>.clumped.snp_list
[11]:
het_df = admix.assoc.marginal(
    dset=dset,
    pheno=sim_pheno,
    family="linear",
    method="HET",
)
admix.assoc.marginal:   0%|                                                                                          | 0/2 [00:00<?, ?it/s]
2024-01-27 20:05:42 [info     ] Currently HET test is implemented through statsmodels, which can be slow. Pass in small amount of data whenever possible.
admix.assoc.marginal:  50%|█████████████████████████████████████████                                         | 1/2 [00:04<00:04,  4.67s/it]
2024-01-27 20:05:47 [info     ] Currently HET test is implemented through statsmodels, which can be slow. Pass in small amount of data whenever possible.
admix.assoc.marginal: 100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.11s/it]
[12]:
fig, ax = plt.subplots(figsize=(2, 2), dpi=150)
admix.plot.qq(het_df.P.values, bootstrap_ci=True, ax=ax)
plt.show()
lambda GC: 2.22 [1.85, 2.54]
../_images/notebooks_assoc_13_1.png