Source code for mpca.utils.datahandling

import h5py
import numpy as np

[docs]def write_h5(filename, dataname, data):
	'''
	Write data to a h5 file.
	NOTE: Replaces dataset dataname if already exists.

	:param filename: Name of the file on disk.
	:param dataname: Name the datataset.
	:param data: The data to store.
	'''
	with h5py.File(filename, 'a') as hf:
		try:
			hf.create_dataset(dataname,  data=data)
		except RuntimeError:
			print("Replacing dataset "+dataname + " in "+ filename)
			del hf[dataname]
			hf.create_dataset(dataname,  data=data)


[docs]def read_h5(filename, dataname):
	'''
	Read data from a h5 file.

	:param filename: Name of the file on disk.
	:param dataname: Name of the dataset.
	:return: The data.
	'''
	with h5py.File(filename, 'r') as hf:
		data = hf[dataname][:]
	return data

[docs]def get_pop_superpop_list(file):
	'''
	Get a list mapping populations to superpopulations from file.

	:param file: directory, filename and extension of a file mapping populations to superpopulations.
	:return: a (n_pops) x 2 list

	Assumes file contains one population and superpopulation per line, separated by ","  e.g.

	Kyrgyz,Central/South Asia

	Khomani,Sub-Saharan Africa

	'''

	pop_superpop_list = np.genfromtxt(file, usecols=(0,1), dtype=str, delimiter=",")
	return pop_superpop_list


[docs]def read_from_EIGENSTRAT(genofile, popfile):
	'''
	Read genotypes from eigenstrat file, and sample ID and population ID from another file.
	:param genofile: text file with genotypes represented as 0,1,2 (9 for missing data) in a (n_markers x n_samples) order
	:param popfile: text file containing sample IDs and their population IDs. E.g. a plink fam file,
              or a file that contains one line for each sample with the following information: "populationID sampleID"
	:return: genotypes (n_samples x n_markers)
			 ind_pop_list: array mapping individual IDs to populations so that ind_pop_list[i,0] is the individual ID
		   of sample i, and ind_pop_list[i,1] is the population of sample i, in the same order as in genotypes
	NOTE: the genotypes are transposed
	'''
	ind_pop_list = np.genfromtxt(popfile, usecols=(1,0), dtype=str)
	n_samples = len(ind_pop_list)
	# read genotypes from file
	genotypes = np.genfromtxt(genofile, delimiter = np.repeat(1, n_samples)).T
	return genotypes, ind_pop_list

[docs]def normalize_genos_EIGENSTRATstyle(genodata):
	'''
	Normalize genotypes as described in EIGENSTRAT article

	Principal components analysis corrects for stratification in genome-wide association studies
	Alkes L Price, Nick J Patterson, Robert M Plenge, Michael E Weinblatt, Nancy A Shadick & David Reich
	Nature Genetics
	2006

	Centering by mean and normalization over rows (over SNPs).

	Missing data exluded from normalization and set to value 0.

	:param genodata: genotypes represented as 0,1,2, missing values encoded as 9
	:type genodata: array, shape (n_markers x n_samples)
	:return: Centered and normalized genodata, transposed to (n_samples x n_markers).
	'''

	# mean_test = np.mean(genotypes_train, axis=0)
	# X_new = genotypes_train-mean_test
	# print(mean_test.shape)
	# print(mean_test)



	for r in range(genodata.shape[0]):
		snp_obs = 0.0
		snp_sum = 0.0

		genotypes = genodata[r,:]


		for n in range(len(genotypes)):
			if genotypes[n] < 9.0:
				snp_obs  += 1.0
				snp_sum += genotypes[n]

		snp_avg = snp_sum / snp_obs

		allele_freq_est = (snp_sum + 1.0) / ((2.0 * snp_obs) + 2.0)
		allele_freq_std  = allele_freq_est * (1.0-allele_freq_est)

		if (allele_freq_std > 0.0):
			f = 1.0 / np.sqrt(allele_freq_std)
		else:
			f = 1.0

		for n in range(len(genotypes)):
			if genotypes[n] < 9.0:
				genotypes[n] = (genotypes[n] - snp_avg) * f
			else:
				genotypes[n] = 0

		genodata[r,:] = genotypes

	return genodata.T

[docs]def remove_values(data, missing_fraction, missing_val=-1.0):
	'''
	Randomly set missing_fraction of the data to missing.

	:param data: the data
	:type data: array, shape (n_samples x n_variables)
	:param missing_fraction: fraction of data to set to missing
	:param missing_val: the value used to represent missing data
	:return:
	'''
	n_variables = data.shape[1]
	n_missing = int(n_variables*missing_fraction)
	data = np.copy(data)
	for s in range(data.shape[0]):
		missing_idx = np.random.choice(n_variables, size=n_missing, replace=False)
		data[s,missing_idx] = missing_val
	return data