Source code for admix.cli._assoc

import admix
import pandas as pd
from typing import Union, List
from admix import logger
from ._utils import log_params


[docs]def assoc( pfile: str, pheno: str, out: str, method: Union[str, List[str]] = "ATT", family: str = "quant", quantile_normalize: bool = False, snp_list: str = None, fast: bool = True, ): """ Perform association testing. Parameters ---------- pfile : str Prefix to the PLINK2 file (.pgen should not be added). When using a method requiring local ancestry, a matching :code:`<pfile>.lanc` file should also exist. pheno : str Path to the phenotype file. The text file should be space delimited with header and one individual per row. 1st column: individual ID. 2nd column: phenotype values. 3rd - nth columns: covariates. NaN should be encoded as "NA" and these individuals will be removed in the analysis. Binary phenotype should be encoded as 0 and 1, and :code:`--family binary` should be used. All columns will be used for the analysis. NaN should be encoded as "NA" and NaN will be imputed with the mean of each covariate. Categorical covariates will be converted to one hot encodings internally. out : str Path the output file. :code:`<out>.<method>.assoc` will be created. method : Union[str, List[str]] Method to use for association analysis (default ATT). Other methods include: TRACTOR, ADM, SNP1, HET family : str Family to use for association analysis (default quant). One of :code:`quant` or :code:`binary`. quantile_normalize : bool Whether to quantile normalize the phenotype and every covariate. When :code:`--family binary` is used, quantile normalization will only be applied to covariates. snp_list : str Path to the SNP list file. Each line should be a SNP ID. Only SNPs in the list will be used for the analysis. fast : bool Whether to use fast mode (default True). """ log_params("assoc", locals()) assert family in ["quant", "binary"], "family must be either quant or binary" # TODO: infer block size using memory use in dask-pgen read dset = admix.io.read_dataset(pfile) admix.logger.info(f"{dset.n_snp} SNPs and {dset.n_indiv} individuals are loaded") df_pheno = pd.read_csv(pheno, delim_whitespace=True, index_col=0, low_memory=False) df_pheno.index = df_pheno.index.astype(str) pheno_col = df_pheno.columns[0] covar_cols = df_pheno.columns[1:] if len(covar_cols) == 0: covar_cols = None admix.logger.info(f"trait column: {pheno_col}") admix.logger.info(f"covariate columns: {covar_cols}") dset.append_indiv_info(df_pheno, force_update=True) # retain only individuals with non-missing phenotype, # or with non-completely missing covariate indiv_mask = ~dset.indiv[pheno_col].isna().values if covar_cols is not None: covar_mask = ~(dset.indiv[covar_cols].isna().values.all(axis=1)) indiv_mask &= covar_mask dset = dset[:, indiv_mask] admix.logger.info( f"{dset.n_indiv} individuals left " "after filtering for missing phenotype, or completely missing covariate" ) # filter for SNPs if snp_list is not None: with open(snp_list, "r") as f: filter_snp_list = [line.strip() for line in f] n_filter_snp = len(filter_snp_list) filter_snp_list = dset.snp.index[dset.snp.index.isin(filter_snp_list)] if len(filter_snp_list) < n_filter_snp: admix.logger.warning( f"{n_filter_snp - len(filter_snp_list)} SNPs in {snp_list} are not in the dataset" ) dset = dset[filter_snp_list] admix.logger.info( f"{dset.n_snp} SNPs and {dset.n_indiv} individuals in the analysis" ) if isinstance(method, str): method = [method] # extract pheno_values and covar_values pheno_values = dset.indiv[pheno_col].values if covar_cols is not None: covar_values = admix.data.convert_dummy(dset.indiv[covar_cols]).values else: covar_values = None if quantile_normalize: if family == "quant": pheno_values = admix.data.quantile_normalize(pheno_values) if covar_values is not None: for i in range(covar_values.shape[1]): covar_values[:, i] = admix.data.quantile_normalize(covar_values[:, i]) dict_rls = {} for m in method: admix.logger.info(f"Performing association analysis with method {m}") dict_rls[m] = admix.assoc.marginal( dset=dset, pheno=pheno_values, cov=covar_values, method=m, family="logistic" if family == "binary" else "linear", fast=fast, ) for m in dict_rls: dict_rls[m].to_csv( f"{out}.{m}.assoc", sep="\t", float_format="%.6g", na_rep="NA" ) logger.info(f"Output written to {out}.{m}.assoc")