[docs]defassoc(pfile:str,pheno:str,out:str,method:Union[str,List[str]]="ATT",family:str="quant",quantile_normalize:bool=False,snp_list:str=None,fast:bool=True,):""" Perform association testing. Parameters ---------- pfile : str Prefix to the PLINK2 file (.pgen should not be added). When using a method requiring local ancestry, a matching :code:`<pfile>.lanc` file should also exist. pheno : str Path to the phenotype file. The text file should be space delimited with header and one individual per row. 1st column: individual ID. 2nd column: phenotype values. 3rd - nth columns: covariates. NaN should be encoded as "NA" and these individuals will be removed in the analysis. Binary phenotype should be encoded as 0 and 1, and :code:`--family binary` should be used. All columns will be used for the analysis. NaN should be encoded as "NA" and NaN will be imputed with the mean of each covariate. Categorical covariates will be converted to one hot encodings internally. out : str Path the output file. :code:`<out>.<method>.assoc` will be created. method : Union[str, List[str]] Method to use for association analysis (default ATT). Other methods include: TRACTOR, ADM, SNP1, HET family : str Family to use for association analysis (default quant). One of :code:`quant` or :code:`binary`. quantile_normalize : bool Whether to quantile normalize the phenotype and every covariate. When :code:`--family binary` is used, quantile normalization will only be applied to covariates. snp_list : str Path to the SNP list file. Each line should be a SNP ID. Only SNPs in the list will be used for the analysis. fast : bool Whether to use fast mode (default True). """log_params("assoc",locals())assertfamilyin["quant","binary"],"family must be either quant or binary"# TODO: infer block size using memory use in dask-pgen readdset=admix.io.read_dataset(pfile)admix.logger.info(f"{dset.n_snp} SNPs and {dset.n_indiv} individuals are loaded")df_pheno=pd.read_csv(pheno,delim_whitespace=True,index_col=0,low_memory=False)df_pheno.index=df_pheno.index.astype(str)pheno_col=df_pheno.columns[0]covar_cols=df_pheno.columns[1:]iflen(covar_cols)==0:covar_cols=Noneadmix.logger.info(f"trait column: {pheno_col}")admix.logger.info(f"covariate columns: {covar_cols}")dset.append_indiv_info(df_pheno,force_update=True)# retain only individuals with non-missing phenotype,# or with non-completely missing covariateindiv_mask=~dset.indiv[pheno_col].isna().valuesifcovar_colsisnotNone:covar_mask=~(dset.indiv[covar_cols].isna().values.all(axis=1))indiv_mask&=covar_maskdset=dset[:,indiv_mask]admix.logger.info(f"{dset.n_indiv} individuals left ""after filtering for missing phenotype, or completely missing covariate")# filter for SNPsifsnp_listisnotNone:withopen(snp_list,"r")asf:filter_snp_list=[line.strip()forlineinf]n_filter_snp=len(filter_snp_list)filter_snp_list=dset.snp.index[dset.snp.index.isin(filter_snp_list)]iflen(filter_snp_list)<n_filter_snp:admix.logger.warning(f"{n_filter_snp-len(filter_snp_list)} SNPs in {snp_list} are not in the dataset")dset=dset[filter_snp_list]admix.logger.info(f"{dset.n_snp} SNPs and {dset.n_indiv} individuals in the analysis")ifisinstance(method,str):method=[method]# extract pheno_values and covar_valuespheno_values=dset.indiv[pheno_col].valuesifcovar_colsisnotNone:covar_values=admix.data.convert_dummy(dset.indiv[covar_cols]).valueselse:covar_values=Noneifquantile_normalize:iffamily=="quant":pheno_values=admix.data.quantile_normalize(pheno_values)ifcovar_valuesisnotNone:foriinrange(covar_values.shape[1]):covar_values[:,i]=admix.data.quantile_normalize(covar_values[:,i])dict_rls={}forminmethod:admix.logger.info(f"Performing association analysis with method {m}")dict_rls[m]=admix.assoc.marginal(dset=dset,pheno=pheno_values,cov=covar_values,method=m,family="logistic"iffamily=="binary"else"linear",fast=fast,)formindict_rls:dict_rls[m].to_csv(f"{out}.{m}.assoc",sep="\t",float_format="%.6g",na_rep="NA")logger.info(f"Output written to {out}.{m}.assoc")