// Core Genotype caller for the BSNP software package.
//
// Simply Compile with g++ -O3 -o BSNP *.cpp
//
//      Written by Brad Gulko, Siepel Lab, Cornell University Department of Biological Statistics and Computational Biology
//              http://compgen.bscb.cornell.edu/~acs/
//
//      Copyright 2010 Brad Gulko & Cornell University.
//              bgulko@cs.cornell.edu / bg279@cornell.edu.
//

#include <algorithm>	// sort()

#include "ShortReadGenotype.h"
#include <string.h>
#include <iostream>
#include <fstream>
using namespace std;

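// Asymmetrical error functions
//		P(Actual Nuc | Called Nuc) when the called nucleotide is wrong.
//
// Each table below is indexed as [Called Nuc][Actual Nuc], with nucleotides in
// the order A, C, G, T (rows = called nucleotide, columns = actual nucleotide).
// The diagonal (called == actual) holds 1.0; the three off-diagonal entries of
// each row sum to 1.0.
//
// So, for example, 
//	P( Nuc = T | NucCalled=C ^ Tech=454 ^ Nuc!=NucCalled) = 50.863%
//  P( Nuc = C | NucCalled=G ^ Tech=Ill ^ Nuc!=NucCalled) = 20.975%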

// Non-informative error distribution: a mis-called base is equally likely to
// be any of the three other nucleotides. Rows are the called nucleotide,
// columns the actual nucleotide; the diagonal holds 1.0.
double	ShortReadGenotype::_errorDistUnknown[4][4] = {
	//                actual A    actual C    actual G    actual T
	/* called A */	{ 1.0,        1.0 / 3.0,  1.0 / 3.0,  1.0 / 3.0 },
	/* called C */	{ 1.0 / 3.0,  1.0,        1.0 / 3.0,  1.0 / 3.0 },
	/* called G */	{ 1.0 / 3.0,  1.0 / 3.0,  1.0,        1.0 / 3.0 },
	/* called T */	{ 1.0 / 3.0,  1.0 / 3.0,  1.0 / 3.0,  1.0       }
};

// 454 error distribution (from 08-Watson).
// Rows are the called nucleotide, columns the actual nucleotide (A, C, G, T).
double	ShortReadGenotype::_errorDist454[4][4] = {
	//                actual A  actual C  actual G  actual T
	/* called A */	{ 1.0,      0.31692,  0.36637,  0.31671 },
	/* called C */	{ 0.24447,  1.0,      0.24690,  0.50863 },	// fixed a typo in this line (V2.15)
	/* called G */	{ 0.46408,  0.28430,  1.0,      0.25162 },
	/* called T */	{ 0.31437,  0.42366,  0.26197,  1.0     }
};

// Sanger error distribution (from 09-Venter).
// Rows are the called nucleotide, columns the actual nucleotide (A, C, G, T).
double	ShortReadGenotype::_errorDistSanger[4][4] = {
	//                actual A  actual C  actual G  actual T
	/* called A */	{ 1.0,      0.27040,  0.32958,  0.40002 },
	/* called C */	{ 0.32431,  1.0,      0.20741,  0.46828 },
	/* called G */	{ 0.41232,  0.25228,  1.0,      0.33540 },
	/* called T */	{ 0.38432,  0.39856,  0.21712,  1.0     }
};

// SOLiD error distribution (from 10 ABT).
// Rows are the called nucleotide, columns the actual nucleotide (A, C, G, T).
double	ShortReadGenotype::_errorDistSolid[4][4] = {
	//                actual A  actual C  actual G  actual T
	/* called A */	{ 1.0,      0.26457,  0.36185,  0.37358 },
	/* called C */	{ 0.26532,  1.0,      0.23171,  0.50297 },
	/* called G */	{ 0.48444,  0.23156,  1.0,      0.28400 },
	/* called T */	{ 0.36736,  0.37168,  0.26096,  1.0     }
};

// Illumina error distribution (MLE of 01, 02, 03, 06, 07).
// Rows are the called nucleotide, columns the actual nucleotide (A, C, G, T).
double	ShortReadGenotype::_errorDistIllumina[4][4] = {
	//                actual A  actual C  actual G  actual T
	/* called A */	{ 1.0,      0.38082,  0.28480,  0.33438 },
	/* called C */	{ 0.39379,  1.0,      0.20673,  0.39948 },
	/* called G */	{ 0.40040,  0.20975,  1.0,      0.38985 },
	/* called T */	{ 0.33122,  0.28814,  0.38064,  1.0     }
};

// COND_PROBS[nucleotide][genotype]: weight of each nucleotide (rows a, c, g, t)
// under each of the 10 diploid genotypes (columns). The entry is 1.0 when the
// genotype is homozygous for the nucleotide, 0.5 when the genotype is a
// heterozygote containing it, and 0.0 otherwise.
double COND_PROBS[4][10]= {
	//            aa   ac   ag   at   cc   cg   ct   gg   gt   tt
	/* a */		{ 1.0, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 },
	/* c */		{ 0.0, 0.5, 0.0, 0.0, 1.0, 0.5, 0.5, 0.0, 0.0, 0.0 },
	/* g */		{ 0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 1.0, 0.5, 0.0 },
	/* t */		{ 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.5, 1.0 }
};


// SNP_TYPES[reference nucleotide][genotype]: SNP class of each diploid
// genotype (columns) relative to a haploid reference nucleotide (rows).
//		0 : X -> XX		genotype is homozygous for the reference
//		1 : X -> XY		heterozygous, one allele matches the reference
//		2 : X -> YY		homozygous for a non-reference nucleotide
//		3 : X -> YZ		heterozygous, neither allele matches the reference
int SNP_TYPES[4][10]= {
	//            aa  ac  ag  at  cc  cg  ct  gg  gt  tt
	/* a */		{ 0,  1,  1,  1,  2,  3,  3,  2,  3,  2 },
	/* c */		{ 2,  1,  3,  3,  0,  1,  1,  2,  3,  2 },
	/* g */		{ 2,  3,  1,  3,  2,  1,  3,  0,  1,  2 },
	/* t */		{ 2,  3,  3,  1,  2,  3,  1,  2,  1,  0 }
};

// Configure the genotype caller with its prior distributions and the
// technology specific error model.
//
//	DiploidCombos / DiploidPriors	names and prior probabilities of the diploid genotypes;
//									at least 10 priors are required.
//	HaploidStates / HaploidPriors	names and prior probabilities of the haploid states;
//									at least 4 priors are required.
//	SequencingTechnology			0 = UNKNOWN, 1 = Illumina, 2 = Sanger, 3 = 454, 4 = SOLiD
//
// Returns false when too few priors are supplied (the LogProb copies are then
// left unconverted) or when the technology code is not recognized (the
// non-informative error distribution is selected in that case).
bool
ShortReadGenotype::Init( vector<string> DiploidCombos, vector<double> DiploidPriors, vector<string> HaploidStates, vector<double> HaploidPriors, int SequencingTechnology ) {
	bool	ok = true;
	long	i;
	_diploidCombos		= DiploidCombos;
	_diploidPriors		= DiploidPriors;
	_haploidCombos		= HaploidStates;
	_haploidPriors		= HaploidPriors;

	_diploidPriorsLP.resize( _diploidPriors.size() );
	_haploidPriorsLP.resize( _haploidPriors.size() );

	// The conversion loops touch exactly 4 haploid and 10 diploid entries, so
	// refuse shorter inputs instead of indexing past the end of the vectors.
	if ( (_haploidPriors.size() < 4) || (_diploidPriors.size() < 10) ) {
		ok = false;
	} else {
		for (i=0; i<4;  i++) {_haploidPriorsLP[i] = _haploidPriors[i]; };	// Convert double to LogProb
		for (i=0; i<10; i++) {_diploidPriorsLP[i] = _diploidPriors[i]; };	// Convert double to LogProb
	}

	_sortedStates.resize(10);
	_hasSortedStates = false;	// sort only if needed...

	// Now set up error handling. It turns out that when a base is mis-sequenced the probability
	//	distribution over the remaining 3 nucleotides is not uniform. Furthermore, this distribution
	//	varies from sequencing technology to technology. While it may also vary based on sample prep,
	//	sequence parameters and context (such as flanking bases, I have evidence of this), this attempts
	//	to capture the first order distribution among errors, based on a limited sample. The most justified is
	//	the Illumina data, based on 5 genomes. Note this was not used in our NatGen publication and has not
	//	been peer reviewed; if in doubt, use the non-informative distribution, it is also faster....--BG
	// 0 = UNKNOWN, 1 = Illumina, 2 = Sanger, 3 = 454, 4 = SOLiD (converted to Nuc Space)
	switch( SequencingTechnology  ) {
		case 0 :		_errDist = &(ShortReadGenotype::_errorDistUnknown);		break;	// non-informative prior
		case 1 :		_errDist = &(ShortReadGenotype::_errorDistIllumina);	break;	// Illumina / Solexa
		case 2 :		_errDist = &(ShortReadGenotype::_errorDistSanger);		break;	// Sanger
		case 3 :		_errDist = &(ShortReadGenotype::_errorDist454);			break;	// 454
		case 4 :		_errDist = &(ShortReadGenotype::_errorDistSolid);		break;	// AB SOLiD
		default :		_errDist = &(ShortReadGenotype::_errorDistUnknown);
						ok = false;												break;	// invalid technology
	};

	return( ok );
}

// Add this object's per-nucleotide read counters to the caller's running totals.
// Each output array must hold 5 entries, indexed like NucCharToIndex()
// (0..3 = A, C, G, T and 4 = N / anything else). The arrays are accumulated
// into, not overwritten.
void	
ShortReadGenotype::GatherCountData( double *Total, double *Unvoided, double *Partial, double *Effective ) const {
	for (int nuc = 0; nuc < 5; ++nuc) {
		Total[nuc]		+= (double) _totalReads[nuc];
		Unvoided[nuc]	+= (double) _unvoidedReads[nuc];
		Partial[nuc]	+= _partialReads[nuc];
		Effective[nuc]	+= _effectiveReads[nuc];
	}
}

/*
bool 
ShortReadGenotype::ResizeStates( long StateSetSize ) {
	long cur_size;
	cur_size = _CurrentReads.size();
	//if (StateSetSize > 2048) {
	//	printf("Scoobie\n"); }
	if (cur_size > StateSetSize) return( true );
			_CurrentReads.resize( StateSetSize * 2 );
	cur_size = _CurrentReads.size();
	for (long i=0; i<cur_size; i++) {
		_CurrentReads[i].Init( _haploidCombos, _haploidPriors ); }
	return( true );
}
*/
	
// Classify a diploid genotype relative to a haploid reference nucleotide by
// looking it up in SNP_TYPES.
//
//	RefHaploid	reference nucleotide index, 0..3 (A, C, G, T)
//	SNPDiploid	diploid genotype index, 0..9 (AA, AC, AG, AT, CC, CG, CT, GG, GT, TT)
//
// Returns
//		class 0 X -> XX (not a SNP, i.e. A -> AA)
//		class 1 X -> XY (i.e. A -> AG)
//		class 2 X -> YY (i.e. A -> GG)
//		class 3 X -> YZ (i.e. A -> CT)
// Indices outside the table (for example the 'N' index 4 produced by
// NucCharToIndex, or the error value 10 from GetHomozygousDiploidIndex)
// yield class 0 instead of reading past the end of the table.
long		
ShortReadGenotype::GetSNPClass( long RefHaploid, long SNPDiploid ) {
	if ( (RefHaploid < 0) || (RefHaploid > 3) ) return( 0 );
	if ( (SNPDiploid < 0) || (SNPDiploid > 9) ) return( 0 );

	return( SNP_TYPES[RefHaploid][SNPDiploid] );
}


// Build the haploid prior distribution from the background GC-content.
//
//	StateNames		output, resized to 5: "A", "C", "G", "T", "N"
//	HaploidPriors	output, resized to 5: A and T each get (1 - Phi) / 2,
//					C and G each get Phi / 2, and N gets 0
//	Phi				background GC-content (~0.4)
void
ShortReadGenotype::GetHaploidPriors(vector<string> &StateNames, vector<double> &HaploidPriors, double Phi ) {
	const double at_share = (1.0 - Phi )	/ 2.0;	// prior of A and of T
	const double gc_share =        Phi		/ 2.0;	// prior of C and of G

	const char * const	labels[5]	= { "A", "C", "G", "T", "N" };
	const double		priors[5]	= { at_share, gc_share, gc_share, at_share, 0.0 };

	HaploidPriors.resize( 5 );
	StateNames.resize( 5 );
	for (int s = 0; s < 5; ++s) {
		HaploidPriors[s]	= priors[s];
		StateNames[s]		= labels[s];
	}
}

// Build the prior distribution over the 10 diploid genotypes.
//
//	StateNames		output, resized to 10: AA AC AG AT CC CG CT GG GT TT
//	DiploidPriors	output, resized to 10, in the same order, renormalized to sum to 1
//	Phi				\phi   : background GC-content (~0.4)
//	Kappa			\kappa : transition/transversion ratio (~5)
//	P				total prior mass given to the heterozygous genotypes;
//					P_{0} = 1 - P is the total mass of the homozygous genotypes
//
// Model:
//		p(AA) = p(TT) = P_{0}(1-\phi)/2
//		p(CC) = p(GG) = P_{0}\phi/2
//
//		p(AG) = p(CT) = (1-P_{0})\kappa\phi(1-\phi)/(1+2\kappa\phi(1-\phi))
//		p(AC) = p(GT) = (1-P_{0})\phi(1-\phi)/(1+2\kappa\phi(1-\phi))
//
//		p(AT) = (1-P_{0})(1-\phi)^{2}/(1+2\kappa\phi(1-\phi))
//		p(CG) = (1-P_{0})\phi^{2}/(1+2\kappa\phi(1-\phi))
//
// (1+2\kappa\phi(1-\phi)) normalizes the 6 heterozygous possibilities; it is
// equal to the sum of their unnormalized weights.
void
ShortReadGenotype::GetDiploidPriors(vector<string> &StateNames, vector<double> &DiploidPriors, double Phi, double Kappa, double P ) {
	const double hom_mass	= 1.0 - P;					// P_{0}
	const double het_mass	= 1.0 - hom_mass;			// 1 - P_{0}
	const double at_frac	= 1.0 - Phi;				// 1 - \phi
	const double het_norm	= 1.0 + 2.0 * Kappa * Phi * at_frac;

	const double w_hom_at		= hom_mass * at_frac / 2.0;						// AA, TT
	const double w_hom_gc		= hom_mass * Phi / 2.0;							// CC, GG
	const double w_transition	= het_mass * Kappa * Phi * at_frac / het_norm;	// AG, CT
	const double w_transversion	= het_mass * Phi * at_frac / het_norm;			// AC, GT
	const double w_at			= het_mass * at_frac * at_frac / het_norm;		// AT
	const double w_cg			= het_mass * Phi * Phi / het_norm;				// CG

	const char * const labels[10] = {
		"AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT" };
	const double weights[10] = {
		w_hom_at, w_transversion, w_transition, w_at, w_hom_gc,
		w_cg, w_transition, w_hom_gc, w_transversion, w_hom_at };

	// force renorm to deal with any roundoff errors.
	double total = weights[0];
	for (int g = 1; g < 10; ++g) total += weights[g];

	DiploidPriors.resize( 10 );
	for (int g = 0; g < 10; ++g) DiploidPriors[g] = weights[g] / total;

	StateNames.resize( 10 );
	for (int g = 0; g < 10; ++g) StateNames[g] = labels[g];
}
// Load the diploid genotype priors from a text file and derive the haploid
// priors from them.
//
// File format: "Name Prob" pairs (for example "AA .145"); a line whose first
// non-blank character is '#' is a comment. Names are truncated to two
// characters and matched (case-sensitively) against DiploidNames.
//
//	FileName		path of the prior file
//	HaploidNames	filled with A, C, G, T, N when it holds fewer than 5 entries
//	HaploidPriors	output, resized to 5; entries 0..3 receive the normalized
//					allele frequencies implied by the diploid priors, entry 4 stays 0
//	DiploidNames	filled with the 10 genotype names when it holds fewer than 10 entries
//	DiploidPriors	output, resized to 10 and normalized to sum to 1
//
// Returns false (after reporting on cerr) when the file cannot be opened, a
// genotype name is not recognized, a probability cannot be parsed or is
// negative, or the priors sum to zero and therefore cannot be normalized.
bool		
ShortReadGenotype::ReadPriors( const string &FileName, vector<string> &HaploidNames, vector<double> &HaploidPriors, vector<string> &DiploidNames, vector<double> &DiploidPriors ){
	int i;

	if (HaploidNames.size() < 5 ) {
		HaploidNames.resize( 5 );
		HaploidNames[0]		= "A";
		HaploidNames[1]		= "C";
		HaploidNames[2]		= "G";
		HaploidNames[3]		= "T";
		HaploidNames[4]		= "N"; }

	if (DiploidNames.size() < 10 ) {
		DiploidNames.resize( 10 );
		DiploidNames[ 0]	  = "AA";
		DiploidNames[ 1]	  = "AC";
		DiploidNames[ 2]	  = "AG";
		DiploidNames[ 3]	  = "AT";
		DiploidNames[ 4]	  = "CC";
		DiploidNames[ 5]	  = "CG";
		DiploidNames[ 6]	  = "CT";
		DiploidNames[ 7]	  = "GG";
		DiploidNames[ 8]	  = "GT";
		DiploidNames[ 9]	  = "TT"; }

	HaploidPriors.resize( 5 );	for (i=0; i<5;  i++) HaploidPriors[i]=0.0;
	DiploidPriors.resize( 10 ); for (i=0; i<10; i++) DiploidPriors[i]=0.0;

	{
		// read values from file
		char c; 
		string name;
		double val;

		std::ifstream fin;
		fin.open( FileName.c_str() );
		if (!fin.is_open()) {
			// Without this check an unreadable file silently produced all-zero (then NaN) priors.
			cerr << "Unable to open Diploid Prior file " << FileName << " terminating" << endl;
			return( false ); }

		while (fin.good()) {
			// operator>> skips leading whitespace, so blank lines never reach us;
			// a failed extraction means we have hit the end of the file.
			if (!(fin >> c)) break;
			if (c == '#') { getline(fin, name ); continue; } // comment
			name = "";
			fin >> name; name = c + name;	// genotype
			name = name.substr(0,2);

			for (i=0; i<10; i++) {
				if (name == DiploidNames[i]) break; }
			if (i>9) {
				cerr << "Unable to parse Diploid Prior Name :" << name << ": from file " << FileName << " terminating" << endl;
				cerr << "\tThere are genrally 10 diploid priors AA AC AG AT CC CG CT GG GT TT" << endl;
				cerr << "\tline format is Name Prob endl for exxample  'AA .145\\n'" << endl;
				cerr << "\tall 10 must be specified." << endl;
				return( false ); }

			val = 0.0;
			if (!(fin >> val)) {			// probability
				cerr << "Unable to parse Diploid Prior value for " << name << " from file " << FileName << " terminating" << endl;
				return( false ); }
			DiploidPriors[i] = val;
		}
	}

	{
		// normalize the Diploid prior.
		double norm = 0.0;
		for (i=0;i<10;i++) {
			norm += DiploidPriors[i]; 
			if ( (DiploidPriors[i] <= 0.0) || (DiploidPriors[i] >= 1.0) ) {
				cerr << "WARNING: Diploid prior " << DiploidNames[i] << " has value " << DiploidPriors[i] << ", values >0 <1 are reccomended." << endl; 
				if (DiploidPriors[i] < 0.0) {
					cerr << "ERROR: negative probability not allowed " << DiploidNames[i] << " has value " << DiploidPriors[i] << ". Exiting." << endl; 
					return( false ); }
			}
		}
		if (!(norm > 0.0)) {
			// nothing usable was read; dividing by norm would fill the priors with NaN
			cerr << "ERROR: Diploid priors from file " << FileName << " sum to " << norm << ", unable to normalize. Exiting." << endl;
			return( false ); }
		for (i=0;i<10;i++) DiploidPriors[i] /= norm;
		DiploidPriors[9] = 1.0;
		for (i=0;i<9;i++) DiploidPriors[9] -= DiploidPriors[i];	// deal with roundoff
	}

	// extract haploid prior: each genotype contributes its probability once per allele
	{
		int j; double norm;
		for (i=0;i<10;i++) {
			for (j=0;j<4;j++) {
				if (DiploidNames[i].substr(0,1) == HaploidNames[j])
					HaploidPriors[j] += DiploidPriors[i];
				if (DiploidNames[i].substr(1,1) == HaploidNames[j])
					HaploidPriors[j] += DiploidPriors[i];
			}
		}

		// renormalize
		norm = 0;
		for (j=0;j<4;j++) norm += HaploidPriors[j];
		if (!(norm > 0.0)) {
			// no diploid name matched any haploid name (caller supplied unexpected names)
			cerr << "ERROR: unable to derive haploid priors from file " << FileName << ". Exiting." << endl;
			return( false ); }
		for (j=0;j<4;j++) HaploidPriors[j] /= norm;
		// deal with roundoff
		HaploidPriors[3] = 1.0;   
		for (j=0;j<3;j++) HaploidPriors[3] -= HaploidPriors[j];
	}

	return( true );
}
	
void
ShortReadGenotype::SetReadStates( long NumReads, const char *Reads, const char *ReadQual, const char *AlignQual ){
	double			state_distrib[5];
	unsigned long	i, num_reads = NumReads;
	double			obs_prob, alt_prob, effective_reads;
	char			read_char_c, read_qual_c, alin_qual_c;
	long			read_char_i;
	double			read_qual_r, alin_qual_r, alin_qual_r_i;


	if (_readStates.size() <  num_reads) _readStates.resize( num_reads * 2 );


	//_totalReads = num_reads;	// Number of nucleotides read out of file at this locus
	// non_reads = 0;			// number of completely invalid nucleotides encountered
	// _effectiveReads = 0.0;		// gauge of how much data we have, effectiuvely, hoew many 'perfect' reads is our data equivelent to.
	// _partialReads	= 0.0;
	for (i=0; i<5; i++) {
		_totalReads[i]		= _unvoidedReads[i]		= 0;
		_partialReads[i]	= _effectiveReads[i]	= 0.0; }


	for (i=0; i<num_reads; i++) {
		// fetch the nucleotide, read quality and alignment quality for read I pst the current position
		read_char_c = Reads[i];
		read_qual_c = ReadQual[i];
		alin_qual_c = AlignQual[ i ];
		read_char_i = NucCharToIndex(	read_char_c ); _totalReads[read_char_i]++;
		read_qual_r = QualCharToReal(	read_qual_c );
		alin_qual_r = QualCharToReal(	alin_qual_c );

		// A non-read is an N nucleotide, or a 0 read qualioty or a 0 alignment quality.
		if ((read_char_i==4) || (read_qual_r==1.0) || (alin_qual_r==1.0)) {
			// non_reads++;
			state_distrib[0] = state_distrib[1] = state_distrib[2] = state_distrib[3] = 0.25;
			state_distrib[4] = 1.0;
		} else { 
			_unvoidedReads[ read_char_i ]++;

			obs_prob		=	(1.0 - read_qual_r);			// proability that 'observed nucleotide is correct
			alin_qual_r_i	=	(1.0 - alin_qual_r);
			// alt_prob	=  (read_qual_r)/3.0;					// proability that some other nucleotide is actual, despite observation. Noninformative distributioon.
			alt_prob		=  read_qual_r;						// This will be distributed according to the Technology specific error distribution, see below.

			effective_reads					= obs_prob*alin_qual_r_i;
			_partialReads[ read_char_i ]	+= alin_qual_r_i;
			_effectiveReads[read_char_i]	+= effective_reads;
			
			// set all elements to background prob
			state_distrib[0] = state_distrib[1] = state_distrib[2] = state_distrib[3] = alt_prob;
			state_distrib[read_char_i]	= obs_prob;							// set "observed" state to perferred prob

			// Deal with technology specific distribution of errors
			// errDist has a 1.0 for the called read, and a probability distribution for the other
			//	values. The some of the non-called read probabilities is 1.0, so this remains normalized.
			state_distrib[0] *= (*(this->_errDist))[read_char_i][0];
			state_distrib[1] *= (*(this->_errDist))[read_char_i][1];
			state_distrib[2] *= (*(this->_errDist))[read_char_i][2];
			state_distrib[3] *= (*(this->_errDist))[read_char_i][3];

			// account for possibility of misalignment & insure normalization
			// this is probability that read was misaligned, thus all observations from this
			// read should be discarded
			state_distrib[4]			= alin_qual_r;													
		};

		_readStates[i].v[0] = state_distrib[0];
		_readStates[i].v[1] = state_distrib[1];
		_readStates[i].v[2] = state_distrib[2];
		_readStates[i].v[3] = state_distrib[3];
		_readStates[i].v[4] = state_distrib[4];
	};
	
	_effectiveReadsSum = _effectiveReads[0];
	for (i=1; i<5;i++) {
		_effectiveReadsSum += _effectiveReads[i]; }

	return;
}


// Apply the correlated-error adjustment to one read's nucleotide distribution.
//
//	StatesIn	distribution of the read as produced by SetReadStates
//	StatesOut	receives the (possibly adjusted) distribution
//	ThetaAdj	in/out: exponent applied to this read's error probability; after an
//				adjusted read it is multiplied by Theta and floored at ThetaMin
//	Theta		per-read decay factor for ThetaAdj
//	ThetaMin	lower bound for ThetaAdj
//	ReadChar	index (0..3) of the nucleotide called by this read
//	DoAdj		false to copy the distribution through unchanged
//
// The input is also copied through unchanged (and ThetaAdj left alone) when
// the read is certainly misaligned (v[4] == 1.0).
void
ShortReadGenotype::correlated_err_adjust( NucStates *StatesIn, NucStates *StatesOut, double *ThetaAdj, double &Theta, double &ThetaMin, int ReadChar, bool DoAdj ) {
	// No adjustment requested (the most common path), or a read that carries
	// no information (rare): hand the input back untouched.
	if ( !DoAdj || (StatesIn->v[4] == 1.0) ) {
		memcpy( (void *) StatesOut, (void *) StatesIn, sizeof(NucStates) );
		return; }

	const double p_err_raw	= 1.0 - StatesIn->v[ReadChar];		// probability the called nucleotide is wrong
	const double p_err_adj	= pow( p_err_raw, *ThetaAdj );		// apply the correlated error adjustment
	const double rescale	= p_err_adj / p_err_raw;			// factor applied to the erroneous-read weights

	// keep the shape of the error distribution, rescale its total mass
	for (int k = 0; k < 4; k++) {
		StatesOut->v[k] = StatesIn->v[k] * rescale; }
	StatesOut->v[4]			= StatesIn->v[4];					// probability of misalignment is unchanged
	StatesOut->v[ReadChar]	= 1.0 - p_err_adj;					// overwrite the value for the observed nucleotide

	// Adjust Theta for the next adjusted read, never dropping below the manual threshold.
	*ThetaAdj *= Theta;
	if (*ThetaAdj < ThetaMin) *ThetaAdj = ThetaMin;
}

// Load the reads observed at one locus and compute the posterior distribution
// over the 10 diploid genotypes.
//
//	ShortReads				one nucleotide character per read
//	ShortReadNucQuality		one base quality character per read
//	ShortReadAlignQuality	one alignment quality character per read
//	Theta					correlated-error factor; 1.0 disables the adjustment (and the sort)
//	ThetaCutoff				lower bound for the effective theta exponent
//	SortAscending			sort direction (by base quality) used when Theta != 1.0
//
// Fills _prob_DataGivenGenotype, _prob_DataAndGenotype, _prob_Data and
// _prob_GenotypeGivenData, and invalidates the sorted-state cache.
//
// Returns false, leaving the previous results untouched, when a quality
// string is shorter than the read string or the sort buffers cannot be prepared.
bool
ShortReadGenotype::SetData(const std::string &ShortReads, const std::string &ShortReadNucQuality, const std::string &ShortReadAlignQuality, double Theta, double ThetaCutoff, bool SortAscending ) {
	long	num_reads;
	double	p_readdata_given_geno;
	double		prob_misseq;
	unsigned long i;
	double		temp_d;
	LogProb		p_data_given_geno, temp_lp;
	NucStates	adj_state;
	const char *nuc_reads, *nuc_readsReadQual, *nuc_readsAlignQual;
	bool		do_err_adjust;	// adjust results for correlated errors
	double		theta_eff;

	// Every read needs a base quality and an alignment quality; shorter quality
	// strings would be indexed past their end below.
	if ( (ShortReadNucQuality.size() < ShortReads.size()) || (ShortReadAlignQuality.size() < ShortReads.size()) )
		return( false );

	do_err_adjust	= (Theta != 1.0);
	num_reads		= (long) ShortReads.size();

	// If theta != 1.0, sort reads by quality and apply theta to all non-canonical reads
	if (!do_err_adjust ) {
		nuc_reads			= ShortReads.c_str();
		nuc_readsReadQual	= ShortReadNucQuality.c_str();
		nuc_readsAlignQual	= ShortReadAlignQuality.c_str();
	} else {
		// Initialize fails when its buffers cannot be allocated; sorting would then write through NULL.
		if (!this->_qualSorter.Initialize( num_reads, ShortReadNucQuality.c_str() ))
			return( false );
		// false = low quality to high quality
		if (!this->_qualSorter.Sort( SortAscending, ShortReads.c_str(), ShortReadNucQuality.c_str(), ShortReadAlignQuality.c_str() ))
			return( false );
		_qualSorter.FetchBuffs( nuc_reads, nuc_readsReadQual, nuc_readsAlignQual );
	}

	// Turns reads, read quality and alignment quality characters into a nucleotide probability distribution for each observation.
	//	Sets buffer array _readStates and the per-nucleotide counters. Resizes the array if needed.
	SetReadStates( num_reads, nuc_reads, nuc_readsReadQual,  nuc_readsAlignQual );

	// Calculate P(Data | Genotype) for each genotype
	for (long genotype_id=0; genotype_id<10; genotype_id++) {
		long read_char_i=4;
		bool do_adj = false;
		p_data_given_geno = 1.0;	// p_data_given_geno is a LogProb, the rest of the intermediate values used in this calculation are doubles.

		theta_eff = 1.0;	// First application is theta^0 = 1.0, second application is Theta^1=Theta ... etc...

		for (long read_num=0; read_num<num_reads; read_num++) {

			// Only reads whose called nucleotide is absent from the genotype
			// (SNP class 2 or 3) receive the correlated-error adjustment.
			if (do_err_adjust) {
				read_char_i = NucCharToIndex(	nuc_reads[read_num] );
				do_adj = false;
				if ( read_char_i < 4 ) {
					do_adj = (SNP_TYPES[read_char_i][genotype_id]>1); };
			};

			correlated_err_adjust( &(_readStates[read_num]), &adj_state, &theta_eff, Theta, ThetaCutoff, read_char_i, do_adj );

			p_readdata_given_geno = 
					COND_PROBS[0][genotype_id] * adj_state.v[0] +
					COND_PROBS[1][genotype_id] * adj_state.v[1] +
					COND_PROBS[2][genotype_id] * adj_state.v[2] +
					COND_PROBS[3][genotype_id] * adj_state.v[3];
			prob_misseq = adj_state.v[4];
			// a misaligned read explains the data under any genotype
			temp_d		= (prob_misseq + p_readdata_given_geno*(1.0 - prob_misseq)); 
			temp_lp		= (LogProb) temp_d;
			p_data_given_geno *= temp_lp; 
		}

		// If we were observing an instance of HG18, we'd do it here.
		// save calculated data...
		_prob_DataGivenGenotype[genotype_id] = p_data_given_geno;
	};

	// Now we have P(D | State)
	_prob_Data = 0.0;
	// Calculate P( State ^ Data ) & SUM over states to get P(Data)
	for (i=0; i<10; i++) {
		_prob_DataAndGenotype[i] = _prob_DataGivenGenotype[i] * _diploidPriorsLP[i];
		_prob_Data += _prob_DataAndGenotype[i];}
	//
	// Now calculate P(State | Data) = P(State ^ Data ) / P(Data)
	for (i=0; i<10; i++) {
		_prob_GenotypeGivenData[i] = _prob_DataAndGenotype[i] / _prob_Data; }

	_hasSortedStates = false;

	return( true );
}

// Fetch the i-th most probable genotype from the last SetData() call.
//
//	i			rank, 0 = highest posterior probability, 9 = lowest
//	StateID		receives the genotype index (0..9) at that rank
//	ProbSgD		receives P(genotype | data) for that genotype
//
// The ranking is built lazily on the first call after SetData() and cached.
// An out-of-range rank yields StateID = -1 and ProbSgD = 0.0.
void	
ShortReadGenotype::GetSortedState(long i, long &StateID, double &ProbSgD )	{ 
	if (!_hasSortedStates) {
		// Sort states in order of decreasing posterior probability, for later printout.
		// A separate loop index is used so the requested rank i is not overwritten
		// (reusing i left it at 10 and read past the end of _sortedStates).
		long s;
		for (s=0;s<10;s++) {
			_sortedStates[s].second	= s;
			_sortedStates[s].first	= -_prob_GenotypeGivenData[s]; }	// negate so an ascending sort gives descending probability

		sort(_sortedStates.begin(), _sortedStates.end() );

		for (s=0;s<10;s++) {
			_sortedStates[s].first	= -_sortedStates[s].first; }		// undo the negation

		_hasSortedStates = true; }

	if ( (i < 0) || (i >= (long) _sortedStates.size()) ) {
		StateID = -1;
		ProbSgD = 0.0;
		return; }

	StateID = _sortedStates[i].second;
	ProbSgD = _sortedStates[i].first; 
	return; 
}

// Map a nucleotide index (0..3 = A, C, G, T) to the index of the matching
// homozygous diploid genotype (AA = 0, CC = 4, GG = 7, TT = 9).
// Any other input returns 10, which is not a valid genotype index.
long
ShortReadGenotype::GetHomozygousDiploidIndex( long NucIndex ){
	const long homozygous_of[4] = { 0 /* AA */, 4 /* CC */, 7 /* GG */, 9 /* TT */ };

	// TODO: abort, this is an error!
	if ( (NucIndex < 0) || (NucIndex > 3) ) return( 10 );

	return( homozygous_of[NucIndex] );
}

// Map a nucleotide character to its index: A = 0, C = 1, G = 2, T = 3
// (upper or lower case). 'N' and every other character map to 4.
//
// The case labels are spelled out instead of calling toupper(): passing a
// negative char (any byte >= 0x80 where char is signed) to toupper() is
// undefined behaviour.
long
ShortReadGenotype::NucCharToIndex(char a) {
	long out;

	switch(a) {
		case 'A': case 'a':	out = 0; break; 
		case 'C': case 'c':	out = 1; break; 
		case 'G': case 'g':	out = 2; break; 
		case 'T': case 't':	out = 3; break; 
		case 'N': case 'n':	out = 4; break; 
		default:			out = 4; break; }

	return( out);
}


 
// Decode a quality character into an error probability: the score is the
// character's offset from '!' and is converted by QualScoreToReal().
double	
ShortReadGenotype::QualCharToReal( char A ) {
	const long score = A - '!';
	return( ShortReadGenotype::QualScoreToReal( score ) );
}

// Convert a quality score A into a probability: 10^(-A/10).
// A score of 0 gives 1.0; every further 10 points divide the result by 10.
double
ShortReadGenotype::QualScoreToReal( long A ) {
	const double exponent = static_cast<double>( A ) / -10.0;
	return( pow( 10.0, exponent ) );
}


// Free buffer a (if it is not NULL) and reset the pointer to NULL.
// The argument is expanded several times, so pass a plain lvalue.
#define SAFE_FREE( a ) { if ( (a) != NULL) { free( (a) ); (a) = NULL; }; }
// Despite the name this does not preserve the old contents: buffer a is freed
// and replaced by a fresh zero-filled (calloc) array of b elements of type c.
// a is left NULL when the allocation fails.
#define SAFE_REALLOC( a, b, c  ) { SAFE_FREE( a ); a = (c *) calloc( b , sizeof(c) ); }
// Make sure the item array and the three character buffers can hold NewSize
// entries plus a terminating null.
//
//	NewSize < 0		release all buffers and return true
//	NewSize >= 0	requests below 16 are rounded up to 16; when the current
//					allocation is not strictly larger than the request, the
//					buffers are replaced by zero-filled ones of twice the
//					requested size (previous contents are discarded)
//
// Returns false if any allocation fails; _numAllocated is then 0.
bool
IndexRecord::InsureSize( long NewSize ) {

	if (NewSize >= 0) {
		// Insure a reasonable minimum
		const long wanted = (NewSize < 16) ? 16 : NewSize;

		// Strictly larger is required: one byte is kept for null termination.
		if (wanted < _numAllocated) return( true );

		// Allocate twice what was asked for, which prevents frequent expensive
		// reallocation for growing data set sizes.
		const long capacity = wanted * 2;
		_numAllocated = 0;
		SAFE_REALLOC( _items,			capacity, IndexRecordItem );
		SAFE_REALLOC( _bufNucs,			capacity, char );
		SAFE_REALLOC( _bufQualRead,		capacity, char );
		SAFE_REALLOC( _bufQualAlign,	capacity, char );
		if ( !_items || !_bufNucs || !_bufQualRead || !_bufQualAlign) return( false );	// this is a terminal error.
		_numAllocated = capacity;
		return( true ); }

	// Negative size: free the buffers
	_numAllocated = 0;
	SAFE_FREE( _items );
	SAFE_FREE( _bufNucs );
	SAFE_FREE( _bufQualRead );
	SAFE_FREE( _bufQualAlign );
	return( true );
}

bool
IndexRecord::Initialize( long NumItems, const char *DataSource ){
	if (!InsureSize( NumItems )) return( false );
	_numActive = NumItems;
	for (long i =0; i<NumItems; i++) {
		_items[i].SourceArray	= DataSource;
		_items[i].index			= i; };
	return( true );
}

// qsort() comparator for ascending key order. The key of an item is
// SourceArray[index]; ties are broken by index (the larger index sorts first)
// so that the order is total.
// Two records with the same key and the same index compare equal, even when
// they live at different addresses (qsort may compare against a copy of an
// element), as the qsort contract requires a consistent comparison.
int compare_up( const void *Item1, const void *Item2 ) {
	const IndexRecord::IndexRecordItem *i1 = static_cast<const IndexRecord::IndexRecordItem *>( Item1 );
	const IndexRecord::IndexRecordItem *i2 = static_cast<const IndexRecord::IndexRecordItem *>( Item2 );
	if (Item1 == Item2 ) return( 0 );
	if (i1->SourceArray[ i1->index ] > i2->SourceArray[ i2->index ]) return( 1 );
	if (i1->SourceArray[ i1->index ] < i2->SourceArray[ i2->index ]) return( -1 );
	if (i1->index < i2->index ) return( 1 );	// this insures that the order is total, break ties.
	if (i1->index > i2->index ) return( -1 );
	return( 0 );
}

// qsort() comparator for descending key order. The key of an item is
// SourceArray[index]; ties are broken by index (the smaller index sorts first)
// so that the order is total.
// Two records with the same key and the same index compare equal, even when
// they live at different addresses (qsort may compare against a copy of an
// element), as the qsort contract requires a consistent comparison.
int compare_down( const void *Item1, const void *Item2 ) {
	const IndexRecord::IndexRecordItem *i1 = static_cast<const IndexRecord::IndexRecordItem *>( Item1 );
	const IndexRecord::IndexRecordItem *i2 = static_cast<const IndexRecord::IndexRecordItem *>( Item2 );
	if (Item1 == Item2 ) return( 0 );
	if (i1->SourceArray[ i1->index ] < i2->SourceArray[ i2->index ]) return( 1 );
	if (i1->SourceArray[ i1->index ] > i2->SourceArray[ i2->index ]) return( -1 );
	if (i1->index > i2->index ) return( 1 );	// this insures that the order is total, break ties.
	if (i1->index < i2->index ) return( -1 );
	return( 0 );
}

// Sort the index records prepared by Initialize() and copy the three parallel
// input arrays into the internal buffers in that sorted order.
//
//	Ascending	true sorts with compare_up, false with compare_down
//	Nucs, ReadQual, AlignQual
//				parallel source arrays, read at the original indices
//
// The three internal buffers are null terminated after the last active entry.
// Always returns true.
bool
IndexRecord::Sort( bool Ascending, const char *Nucs, const char *ReadQual, const char *AlignQual ) {
	// generate sorted indices.
	qsort( (void *) _items, _numActive, sizeof( IndexRecordItem ), Ascending ? compare_up : compare_down );

	// gather the values through the sorted indices
	for (long dst = 0; dst < _numActive; dst++) {
		const long src = _items[dst].index;
		_bufNucs[dst]		= Nucs[src];
		_bufQualRead[dst]	= ReadQual[src];
		_bufQualAlign[dst]	= AlignQual[src]; }

	// remember null termination!
	_bufQualAlign[_numActive]	= 0;
	_bufQualRead[_numActive]	= 0;
	_bufNucs[_numActive]		= 0;

	return( true );
}
	
 
