import admix
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import List, Dict, Optional
import glob
from natsort import natsorted
import os
import json
from scipy import stats
from ._utils import log_params
from scipy.interpolate import CubicSpline
def _write_admix_grm(
dict_grm: Dict[str, np.ndarray],
df_weight: pd.Series,
df_id: pd.DataFrame,
out_prefix: str,
):
"""write admix grm
Parameters
----------
dict_grm : Dict[str, np.ndarray]
name -> grm matrix
df_weight : pd.Series
snp weights
n_snp : int
number of snps
df_id: pd.DataFrame
list of individuals
out_prefix: str
prefix of the output file
Returns
-------
None
"""
for name in dict_grm:
grm = dict_grm[name]
admix.tools.gcta.write_grm(
out_prefix + "." + name,
K=grm,
df_id=df_id,
n_snps=np.repeat(len(df_weight), len(df_id)),
)
# write weight
df_weight.to_csv(out_prefix + ".weight.tsv", sep="\t")
# Genetic correlation related functions
def admix_grm(
pfile: str,
out_prefix: str,
maf_cutoff: float = 0.005,
    her_model: str = "mafukb",
    freq_cols: List[str] = ["LANC_FREQ1", "LANC_FREQ2"],
    snp_chunk_size: int = 256,
    snp_list: Optional[str] = None,
write_raw: bool = False,
) -> None:
"""
Calculate the admix GRM for a given pfile
Parameters
----------
pfile : str
Path to the pfile
out_prefix : str
Prefix of the output files
maf_cutoff : float, optional
MAF cutoff for the admixed individuals, by default 0.005
her_model : str, optional
Heritability model, by default "mafukb"
one of "uniform", "gcta", "ldak", "mafukb"
freq_cols : List[str], optional
Columns of the pfile to use as frequency, by default ["LANC_FREQ1", "LANC_FREQ2"]
to perform the ancestry-specific MAF cutoffs
snp_chunk_size : int, optional
Number of SNPs to read at a time, by default 256
This can be tuned to reduce memory usage
snp_list : str, optional
Path to a file containing a list of SNPs to use. Each line should be a SNP ID.
Only SNPs in the list will be used for the analysis. By default None
write_raw: bool, optional
Whether to write the raw GRM, G1, G2, G12, by default False
Returns
-------
    GRM files: {out_prefix}.[K1, K2].[grm.bin | grm.id | grm.N.bin] will be generated
Weight file: {out_prefix}.weight.tsv will be generated
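
    Examples
    --------
    A minimal sketch; the paths below are illustrative, not shipped data::

        admix_grm(
            pfile="data/chr1",
            out_prefix="out/chr1",
            maf_cutoff=0.005,
        )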
"""
log_params("admix-grm", locals())
assert len(freq_cols) == 2, "freq_cols must be a list of length 2"
dset = admix.io.read_dataset(pfile=pfile, snp_chunk=snp_chunk_size)
# filter for SNPs
if snp_list is not None:
with open(snp_list, "r") as f:
filter_snp_list = [line.strip() for line in f]
n_filter_snp = len(filter_snp_list)
filter_snp_list = dset.snp.index[dset.snp.index.isin(filter_snp_list)]
if len(filter_snp_list) < n_filter_snp:
admix.logger.warning(
f"{n_filter_snp - len(filter_snp_list)} SNPs in {snp_list} are not in the dataset"
)
dset = dset[filter_snp_list]
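    # keep SNPs whose ancestry-specific frequencies both lie within
    # [maf_cutoff, 1 - maf_cutoff]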
snp_subset = np.where(
dset.snp[freq_cols[0]].between(maf_cutoff, 1 - maf_cutoff)
& dset.snp[freq_cols[1]].between(maf_cutoff, 1 - maf_cutoff)
)[0]
dset = dset[snp_subset]
dset.snp["PRIOR_VAR"] = admix.data.calc_snp_prior_var(dset.snp, her_model=her_model)
K1, K2 = admix.data.admix_grm_equal_var(
geno=dset.geno,
lanc=dset.lanc,
snp_prior_var=dset.snp.PRIOR_VAR.values,
n_anc=dset.n_anc,
)
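    # K1 aggregates the ancestry-specific (same-ancestry) components and K2 the
    # cross-ancestry component; the total GRM under genetic correlation rg is
    # K1 + rg * K2 (see genet_cor below)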
df_weight = dset.snp.PRIOR_VAR
df_id = pd.DataFrame({"0": dset.indiv.index.values, "1": dset.indiv.index.values})
_write_admix_grm(
dict_grm={"K1": K1, "K2": K2},
df_weight=df_weight,
df_id=df_id,
out_prefix=out_prefix,
)
if write_raw:
G1, G2, G12 = admix.data.admix_grm(
geno=dset.geno,
lanc=dset.lanc,
snp_prior_var=dset.snp.PRIOR_VAR.values,
)
for suffix, mat in zip(["G1", "G2", "G12"], [G1, G2, G12]):
admix.tools.gcta.write_grm(
f"{out_prefix}.{suffix}",
K=mat,
df_id=df_id,
n_snps=np.repeat(len(df_weight), len(df_id)),
)
def admix_grm_merge(prefix: str, out_prefix: str, n_part: int = 22) -> None:
"""
Merge multiple GRM matrices
Parameters
----------
    prefix : str
        Prefix of the GRM files; any files matching <prefix>*.K1.grm.bin and
        <prefix>*.K2.grm.bin will be merged
out_prefix : str
Prefix of the output file
n_part : int, optional
Number of partitions, by default 22
Returns
-------
    GRM files: {out_prefix}.[K1, K2].[grm.bin | grm.id | grm.N.bin] will be generated
Weight file: {out_prefix}.weight.tsv will be generated
"""
log_params("admix-grm-merge", locals())
# search for files with the pattern of <prefix>.<suffix>.K1.grm.bin and <prefix>.<suffix>.K2.grm.bin
K1_grm_prefix_list = [
f[: -len(".K1.grm.bin")] for f in sorted(glob.glob(f"{prefix}*.K1.grm.bin"))
]
K2_grm_prefix_list = [
f[: -len(".K2.grm.bin")] for f in sorted(glob.glob(f"{prefix}*.K2.grm.bin"))
]
assert len(K1_grm_prefix_list) == len(K2_grm_prefix_list)
assert K1_grm_prefix_list == K2_grm_prefix_list, (
"GRM files .K1 and .K2 are not matched, "
f"K1_grm_list={K1_grm_prefix_list}, K2_grm_list={K2_grm_prefix_list}"
)
grm_prefix_list = natsorted(K1_grm_prefix_list)
if len(grm_prefix_list) != n_part:
raise ValueError(
f"Number of GRM files ({len(grm_prefix_list)}) is not equal to n_part ({n_part})"
)
admix.logger.info(f"{len(grm_prefix_list)} GRM files to be merged: {grm_prefix_list}")
prior_var_list = []
for grm_prefix in grm_prefix_list:
prior_var_list.append(pd.read_csv(grm_prefix + ".weight.tsv", sep="\t", index_col=0))
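    # merge component GRMs as a weighted average, weighting each partition by the
    # sum of its SNP prior variances so each chromosome contributes in proportion
    # to its expected genetic variance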
def _merge(suffix):
total_grm = 0
total_df_id = None
weight_list = []
for i, grm_prefix in enumerate(grm_prefix_list):
prior_var = prior_var_list[i]
weight = prior_var["PRIOR_VAR"].sum()
weight_list.append(weight)
grm, df_id, n_snps = admix.tools.gcta.read_grm(f"{grm_prefix}.{suffix}")
if total_df_id is None:
total_df_id = df_id
else:
assert np.all(
total_df_id == df_id
), f"df_id of {grm_prefix} is not matched does not match with already loaded ones"
total_grm += grm * weight
assert np.all(n_snps == prior_var.shape[0])
total_grm /= np.sum(weight_list)
        return total_grm, total_df_id
K1, df_id1 = _merge("K1")
K2, df_id2 = _merge("K2")
assert df_id1.equals(df_id2)
df_id = df_id1
df_weight = pd.concat(prior_var_list)
_write_admix_grm(
dict_grm={"K1": K1, "K2": K2},
df_weight=df_weight,
df_id=df_id,
out_prefix=out_prefix,
)
def genet_cor(
pheno: str,
grm_prefix: str,
out_dir: str,
rg_grid=np.linspace(0, 1.0, 21),
quantile_normalize: bool = True,
n_thread: int = 2,
clean: bool = True,
):
"""Estimate genetic correlation
Parameters
----------
    pheno : str
        phenotype file; the first column contains individual IDs, the second column
        the phenotype, and the remaining columns covariates.
    grm_prefix : str
        prefix of the K1 / K2 GRM files (<grm_prefix>.K1.grm.bin, <grm_prefix>.K2.grm.bin)
out_dir : str
folder to store the output files
rg_grid : list, optional
List of rg values to grid search, by default np.linspace(0, 1.0, 21)
quantile_normalize: bool
whether to perform quantile normalization for both phenotype and each column of covariates
    n_thread : int, optional
        number of threads, by default 2
    clean : bool, optional
        whether to remove the intermediate per-rg GRM files after each REML fit,
        by default True
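
    Examples
    --------
    A minimal sketch; the paths below are illustrative::

        genet_cor(
            pheno="trait.tsv",
            grm_prefix="out/merged",
            out_dir="out/genet-cor",
        )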
"""
log_params("genet-cor", locals())
## compile phenotype and covariates
df_pheno = pd.read_csv(pheno, sep="\t", index_col=0)
df_pheno.index = df_pheno.index.astype(str)
    trait_col = df_pheno.columns[0]
    covar_cols = df_pheno.columns[1:]
    # subset to individuals with non-missing phenotype
    df_pheno = df_pheno[df_pheno[trait_col].notna()]
df_trait = df_pheno[[trait_col]].copy()
df_covar = df_pheno[covar_cols].copy()
df_covar = admix.data.convert_dummy(df_covar)
if quantile_normalize:
# perform quantile normalization
for col in df_trait.columns:
df_trait[col] = admix.data.quantile_normalize(df_trait[col])
for col in df_covar.columns:
df_covar[col] = admix.data.quantile_normalize(df_covar[col])
# fill na with column mean
df_covar.fillna(df_covar.mean(), inplace=True)
df_id = pd.DataFrame(
{"FID": df_trait.index.values, "IID": df_trait.index.values},
index=df_trait.index.values,
)
df_trait = pd.merge(df_id, df_trait, left_index=True, right_index=True)
df_covar = pd.merge(df_id, df_covar, left_index=True, right_index=True)
## load grm
K1, df_id1, n_snps1 = admix.tools.gcta.read_grm(grm_prefix + ".K1")
K2, df_id2, n_snps2 = admix.tools.gcta.read_grm(grm_prefix + ".K2")
assert df_id1.equals(df_id2)
assert np.allclose(n_snps1, n_snps2)
df_id = df_id1
n_snps = n_snps1
os.makedirs(out_dir, exist_ok=True)
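    # profile-likelihood grid search: for each candidate rg, form K = K1 + rg * K2,
    # fit a single-component GCTA REML, and record the resulting log-likelihood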
for rg in tqdm(rg_grid):
K = K1 + K2 * rg
        grm = os.path.join(out_dir, f"rg{int(round(rg * 100))}")
admix.tools.gcta.write_grm(
grm,
K=K,
df_id=df_id,
n_snps=n_snps,
)
admix.tools.gcta.reml(
grm_path=grm,
df_pheno=df_trait,
df_covar=df_covar,
            out_prefix=grm,
n_thread=n_thread,
est_fix=True,
)
if clean:
# remove <grm>.grm.* files
for f in glob.glob(grm + ".grm.*"):
os.remove(f)
def summarize_genet_cor(
est_dir: str,
out_prefix: str,
    weight_file: Optional[str] = None,
    freq_file: Optional[str] = None,
    scale_factor: Optional[float] = None,
freq_col: str = "FREQ",
index_col: str = "snp",
rg_str: str = "rg",
):
"""Summarize the results of genetic correlation analysis.
Parameters
----------
    est_dir : str
        Estimation directory containing <rg_str><rg>.hsq and <rg_str><rg>.log files
out_prefix : str
output prefix
    weight_file : str, optional
        prior variance file produced by admix-grm (<grm_prefix>.weight.tsv)
    freq_file : str, optional
        frequency file (dataset *.snp_info files)
    scale_factor : float, optional
        rather than computing the scale factor from `weight_file` and `freq_file`
        from scratch, supply a value pre-computed with
        admix.tools.gcta.calculate_hsq_scale
    freq_col : str
        column name for frequency in freq_file
    index_col : str
        column name for the SNP index in freq_file
    rg_str : str
        file-name prefix for rg values, by default "rg"; "rho" for legacy outputs
Returns
-------
    Log-likelihood curve over rg: <out_prefix>.loglkl.txt
    Summary file: <out_prefix>.summary.json, containing
        - posterior mode
        - highest posterior density interval (50% / 95%)
        - heritability (if weight_file and freq_file, or scale_factor, are provided)

    Notes
    -----
    If `weight_file` and `freq_file` are provided, heritability at rg = 1
    (using <rg_str>100.hsq) will be estimated.
"""
log_params("summarize-genet-cor", locals())
rg_list = np.array(
sorted(
[
int(os.path.basename(p).split(".")[0][len(rg_str) :])
for p in glob.glob(os.path.join(est_dir, f"{rg_str}*.hsq"))
]
)
)
admix.logger.info(f"rg={rg_list}")
# read log-likelihood curve
n_indiv = None
loglkl_list = []
for rg in rg_list:
dict_reml = admix.tools.gcta.read_reml(os.path.join(est_dir, f"{rg_str}{rg}"))
if n_indiv is None:
n_indiv = dict_reml["n"]
else:
assert (
n_indiv == dict_reml["n"]
), f"n_indiv={dict_reml['n']} from r={rg} different from previous one {n_indiv}"
loglkl_list.append(dict_reml["loglik"])
    # convert file-name suffixes (e.g., rg100 -> 1.0) back to rg values
    rg_list = rg_list / 100
# write raw estimation file
pd.DataFrame({"rg": rg_list, "loglkl": loglkl_list}).to_csv(
out_prefix + ".loglkl.txt", sep="\t", index=False
)
admix.logger.info(f"Log-likehood curves written to {out_prefix}.loglkl.txt")
# summarize results
    assert rg_list[-1] == 1, f"rg=1 ({rg_str}100.hsq) must be included"
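    # fit a cubic spline through the profile log-likelihood so the mode and HPD
    # intervals can be located at a finer resolution than the estimation grid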
dense_rg_list = np.linspace(min(rg_list), max(rg_list), 1001)
dense_loglkl_list = CubicSpline(rg_list, loglkl_list)(dense_rg_list)
dict_summary = {
"n": n_indiv,
"rg_mode": dense_rg_list[dense_loglkl_list.argmax()],
"rg_hpdi(50%)": admix.data.hdi(dense_rg_list, dense_loglkl_list, ci=0.5),
"rg_hpdi(95%)": admix.data.hdi(dense_rg_list, dense_loglkl_list, ci=0.95),
"rg=1_pval": stats.chi2.sf((dense_loglkl_list.max() - dense_loglkl_list[-1]) * 2, df=1),
}
assert (weight_file is None) == (
freq_file is None
), "weight_file and freq_file should be provided together"
if (weight_file is not None) and (freq_file is not None):
scale_factor = admix.tools.gcta.calculate_hsq_scale(
weight_file=weight_file,
freq_file=freq_file,
freq_col=freq_col,
index_col=index_col,
)
admix.logger.info(f"Computed hsq scale factor = {scale_factor:.3g}")
if scale_factor is not None:
dict_reml = admix.tools.gcta.read_reml(os.path.join(est_dir, f"{rg_str}100"))
est_hsq, est_hsq_var = admix.tools.gcta.estimate_hsq(dict_reml, scale_factor=scale_factor)
est_hsq_stderr = np.sqrt(est_hsq_var)
dict_summary["hsq_est"] = est_hsq
dict_summary["hsq_stderr"] = est_hsq_stderr
# write summary
with open(out_prefix + ".summary.json", "w") as f:
json.dump(dict_summary, f, indent=4)
admix.logger.info(f"Summary written to {out_prefix}.summary.json")
def meta_analyze_genet_cor(loglkl_files):
"""Meta-analyze the results of genetic correlation analysis.
Parameters
----------
    loglkl_files : str
        glob pattern matching the per-trait log-likelihood curve files
        (<out_prefix>.loglkl.txt from summarize_genet_cor)
"""
loglkl_files = glob.glob(loglkl_files)
rg_list = None
    total_dense_loglik = 0.0  # becomes an np.ndarray after the first accumulation
total_n = 0
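    # assuming independent traits, summing per-trait log-likelihood curves is
    # equivalent to multiplying their likelihoods for the meta-analysis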
for f in loglkl_files:
df_loglkl = pd.read_csv(f, sep="\t")
if rg_list is None:
rg_list = df_loglkl["rg"].values
dense_rg_list = np.linspace(min(rg_list), max(rg_list), 1001)
else:
assert np.all(rg_list == df_loglkl["rg"].values)
total_dense_loglik += CubicSpline(rg_list, df_loglkl["loglkl"].values)(dense_rg_list)
        # accumulate sample sizes from the companion summary files
        with open(f.replace(".loglkl.txt", ".summary.json")) as fp:
            total_n += json.load(fp)["n"]
rg_mode = dense_rg_list[total_dense_loglik.argmax()]
pval_rg_1 = stats.chi2.sf((total_dense_loglik.max() - total_dense_loglik[-1]) * 2, df=1)
print(f"Meta-analysis results across {len(loglkl_files)} files")
print("-" * 37)
print(f"rg mode = {rg_mode:4g}")
for ci in [0.5, 0.95]:
rg_hpdi = admix.data.hdi(dense_rg_list, total_dense_loglik, ci=ci)
        if isinstance(rg_hpdi, list):
intervals = [f"[{i[0]:.4g}, {i[1]:.4g}]" for i in rg_hpdi]
            admix.logger.warning(
                f"Multiple intervals for ci={ci}, indicating potential issues with model fit."
                " Please inspect the log-likelihood curves for each trait."
            )
print(f"{ci * 100:g}% HPDI = {' '.join(intervals)}")
else:
assert len(rg_hpdi) == 2
print(f"{ci * 100:g}% HPDI = [{rg_hpdi[0]:.4g}, {rg_hpdi[1]:.4g}]")
print(f"Null (rg = 1) p-value: {pval_rg_1:.4g}")
print(f"Average N={int(np.round(total_n / len(loglkl_files)))}")
def admix_grm_rho(prefix: str, out_dir: str, rho_list=np.linspace(0, 1.0, 21)) -> None:
"""
DEPRECATED. Will be removed in future versions.
Build the GRM for a given rho list
Parameters
----------
    prefix : str
        Prefix of the GRM files (<prefix>.K1.grm.bin and <prefix>.K2.grm.bin)
out_dir : str
folder to store the output files
rho_list : list, optional
List of rho values, by default np.linspace(0, 1.0, 21)
"""
log_params("admix-grm-rho", locals())
K1, df_id1, n_snps1 = admix.tools.gcta.read_grm(prefix + ".K1")
K2, df_id2, n_snps2 = admix.tools.gcta.read_grm(prefix + ".K2")
assert df_id1.equals(df_id2)
assert np.allclose(n_snps1, n_snps2)
df_id = df_id1
n_snps = n_snps1
os.makedirs(out_dir, exist_ok=True)
for rho in tqdm(rho_list):
K = K1 + K2 * rho
        name = os.path.join(out_dir, f"rho{int(round(rho * 100))}")
admix.tools.gcta.write_grm(
name,
K=K,
df_id=df_id,
n_snps=n_snps,
)
def estimate_genetic_cor(
pheno: str,
out_dir: str,
    grm_dir: Optional[str] = None,
    grm_prefix: Optional[str] = None,
quantile_normalize: bool = True,
n_thread: int = 2,
):
"""
DEPRECATED. Will be removed in future versions.
Estimate genetic correlation from a set of GRM files (with different rho values)
Parameters
----------
    pheno : str
        phenotype file; the first column contains individual IDs, the second column
        the phenotype, and the remaining columns covariates.
out_dir : str
folder to store the output files
    grm_dir : str, optional
        folder containing GRM files; exactly one of grm_dir and grm_prefix
        must be specified
    grm_prefix : str, optional
        prefix of a single GRM file to use; exactly one of grm_dir and
        grm_prefix must be specified
quantile_normalize: bool
whether to perform quantile normalization for both phenotype and each column of covariates
n_thread : int, optional
number of threads, by default 2
"""
log_params("estimate-genetic-cor", locals())
    # exactly one of grm_dir or grm_prefix must be specified
    assert (grm_dir is not None) + (
        grm_prefix is not None
    ) == 1, "Exactly one of grm_dir or grm_prefix must be specified"
# compile phenotype and covariates
df_pheno = pd.read_csv(pheno, sep="\t", index_col=0)
df_pheno.index = df_pheno.index.astype(str)
    trait_col = df_pheno.columns[0]
    covar_cols = df_pheno.columns[1:]
    # subset to individuals with non-missing phenotype
    df_pheno = df_pheno[df_pheno[trait_col].notna()]
df_trait = df_pheno[[trait_col]].copy()
df_covar = df_pheno[covar_cols].copy()
df_covar = admix.data.convert_dummy(df_covar)
if quantile_normalize:
# perform quantile normalization
for col in df_trait.columns:
df_trait[col] = admix.data.quantile_normalize(df_trait[col])
for col in df_covar.columns:
df_covar[col] = admix.data.quantile_normalize(df_covar[col])
# fill na with column mean
df_covar.fillna(df_covar.mean(), inplace=True)
df_id = pd.DataFrame(
{"FID": df_trait.index.values, "IID": df_trait.index.values},
index=df_trait.index.values,
)
df_trait = pd.merge(df_id, df_trait, left_index=True, right_index=True)
df_covar = pd.merge(df_id, df_covar, left_index=True, right_index=True)
os.makedirs(out_dir, exist_ok=True)
if grm_dir is not None:
# fit different rho
        grm_prefix_list = [
            os.path.basename(p)[: -len(".grm.bin")]
            for p in glob.glob(os.path.join(grm_dir, "*.grm.bin"))
        ]
else:
assert grm_prefix is not None
grm_dir = os.path.dirname(grm_prefix)
        grm_prefix_list = [os.path.basename(grm_prefix)]
for grm_prefix in grm_prefix_list:
grm = os.path.join(grm_dir, grm_prefix)
out_prefix = os.path.join(out_dir, grm_prefix)
if not os.path.exists(out_prefix + ".hsq"):
admix.tools.gcta.reml(
grm_path=grm,
df_pheno=df_trait,
df_covar=df_covar,
out_prefix=out_prefix,
n_thread=n_thread,
est_fix=True,
)