Implementation of a homogeneous Markov model of order 0 based on AbstractModel: Difference between revisions

From Jstacs
Jump to navigation | Jump to search
(New page: <source lang="java"> import java.util.Arrays; import de.jstacs.NonParsableException; import de.jstacs.NotTrainedException; import de.jstacs.data.AlphabetContainer; import de.jstacs.data.S...)
 
No edit summary
 
(2 intermediate revisions by 2 users not shown)
Line 16: Line 16:
public class HomogeneousMarkovModel extends AbstractModel {
public class HomogeneousMarkovModel extends AbstractModel {


//array for the parameters, i.e. the probabilities for each symbol
private double[] logProbs;//array for the parameters, i.e. the probabilities for each symbol
private double[] probs;
private boolean isTrained;//stores if the model has been trained
//stores if the model has been trained
 
private boolean isTrained;
public HomogeneousMarkovModel( AlphabetContainer alphabets ) throws Exception {
public HomogeneousMarkovModel( AlphabetContainer alphabets ) throws Exception {
//we have a homogeneous Model, hence the length is set to 0
super( alphabets, 0 ); //we have a homogeneous Model, hence the length is set to 0
super( alphabets, 0 );
//a homogeneous Model can only handle simple alphabets
//a homogeneous Model can only handle simple alphabets
if(! (alphabets.isSimple() && alphabets.isDiscrete()) ){
if(! (alphabets.isSimple() && alphabets.isDiscrete()) ){
Line 29: Line 26:
}
}
//initialize parameter array
//initialize parameter array
this.probs = new double[(int) alphabets.getAlphabetLengthAt( 0 )];
this.logProbs = new double[(int) alphabets.getAlphabetLengthAt( 0 )];
//we have not trained the model, yet
isTrained = false; //we have not trained the model, yet
isTrained = false;
}
}


public HomogeneousMarkovModel( StringBuffer stringBuff ) throws NonParsableException {
public HomogeneousMarkovModel( StringBuffer stringBuff ) throws NonParsableException {  
super( stringBuff );
            super( stringBuff );  
}
        }


@Override
protected void fromXML( StringBuffer xml ) throws NonParsableException {
protected void fromXML( StringBuffer xml ) throws NonParsableException {
//extract our XML-code
//extract our XML-code
Line 45: Line 40:
alphabets = (AlphabetContainer) XMLParser.extractStorableForTag( xml, "alphabets" );
alphabets = (AlphabetContainer) XMLParser.extractStorableForTag( xml, "alphabets" );
length = XMLParser.extractIntForTag( xml, "length" );
length = XMLParser.extractIntForTag( xml, "length" );
probs = XMLParser.extractDoubleArrayForTag( xml, "probs" );
logProbs = XMLParser.extractDoubleArrayForTag( xml, "logProbs" );
isTrained = XMLParser.extractBooleanForTag( xml, "isTrained" );
isTrained = XMLParser.extractBooleanForTag( xml, "isTrained" );
}
}
Line 54: Line 49:
XMLParser.appendStorableWithTags( buf, alphabets, "alphabets" );
XMLParser.appendStorableWithTags( buf, alphabets, "alphabets" );
XMLParser.appendIntWithTags( buf, length, "length" );
XMLParser.appendIntWithTags( buf, length, "length" );
XMLParser.appendDoubleArrayWithTags( buf, probs, "probs" );
XMLParser.appendDoubleArrayWithTags( buf, logProbs, "logProbs" );
XMLParser.appendBooleanWithTags( buf, isTrained, "isTrained" );
XMLParser.appendBooleanWithTags( buf, isTrained, "isTrained" );
//add our own tag
//add our own tag
Line 61: Line 56:
}
}


public String getInstanceName() {
public String getInstanceName() {  
return "Homogeneous Markov model of order 0";
            return "Homogeneous Markov model of order 0";  
}
        }


public double getLogPriorTerm() throws Exception {
public double getLogPriorTerm() throws Exception {  
//we use ML-estimation, hence no prior term
            //we use ML-estimation, hence no prior term
return 0;
            return 0;  
}
        }  


public NumericalResultSet getNumericalCharacteristics() throws Exception {
public NumericalResultSet getNumericalCharacteristics() throws Exception {
//we do not have much to tell here
//we do not have much to tell here
return new NumericalResultSet(new NumericalResult("Number of parameters","The number of parameters this model uses",probs.length));
return new NumericalResultSet(new NumericalResult("Number of parameters","The number of parameters this model uses",logProbs.length));
}
}


public double getProbFor( Sequence sequence, int startpos, int endpos ) throws NotTrainedException, Exception {
public double getLogProbFor( Sequence sequence, int startpos, int endpos ) throws NotTrainedException, Exception {
double seqProb = 1.0;
double seqLogProb = 0.0;
//compute the probability of the sequence between startpos and endpos (inclusive)
//compute the log-probability of the sequence between startpos and endpos (inclusive)
//as product of the single symbol probabilities
//as sum of the single symbol log-probabilities
for(int i=startpos;i<=endpos;i++){
for(int i=startpos;i<=endpos;i++){
//directly access the array by the numerical representation of the symbols
//directly access the array by the numerical representation of the symbols
seqProb *= probs[sequence.discreteVal( i )];
seqLogProb += logProbs[sequence.discreteVal( i )];
}
}
return seqProb;
return seqLogProb;
}
public double getProbFor( Sequence sequence, int startpos, int endpos ) throws NotTrainedException, Exception {
return Math.exp( getLogProbFor(sequence, startpos, endpos) );
}
}


public boolean isTrained() {
public boolean isTrained() {  
return isTrained;
            return isTrained;  
}
        }


public void train( Sample data, double[] weights ) throws Exception {
public void train( Sample data, double[] weights ) throws Exception {
//if we do not have any weights, we create some,
//giving all sequences the same weight
if(weights == null){
weights = new double[data.getNumberOfElements()];
Arrays.fill( weights, 1.0 );
}
//reset the parameter array
//reset the parameter array
Arrays.fill( probs, 0.0 );
Arrays.fill( logProbs, 0.0 );
//default sequence weight
double w = 1;
//for each sequence in the data set
//for each sequence in the data set
for(int i=0;i<data.getNumberOfElements();i++){
for(int i=0;i<data.getNumberOfElements();i++){
//retrieve sequence
//retrieve sequence
Sequence seq = data.getElementAt( i );
Sequence seq = data.getElementAt( i );
//if we do have any weights, use them
if(weights != null){
w = weights[i];
}
//for each position in the sequence
//for each position in the sequence
for(int j=0;j<seq.getLength();j++){
for(int j=0;j<seq.getLength();j++){
//count symbols, weighted by weights
//count symbols, weighted by weights
probs[ seq.discreteVal( j ) ] += weights[i];
logProbs[ seq.discreteVal( j ) ] += w;
}
}
}
}
//compute normalization
//compute normalization
double norm = 0.0;
double norm = 0.0;
for(int i=0;i<probs.length;i++){
for(int i=0;i<logProbs.length;i++){ norm += logProbs[i]; }
norm += probs[i];
}
//normalize probs to obtain proper probabilities
//normalize probs to obtain proper probabilities
for(int i=0;i<probs.length;i++){
for(int i=0;i<logProbs.length;i++){ logProbs[i] = Math.log( logProbs[i]/norm ); }
probs[i] /= norm;
}
//now the model is trained
//now the model is trained
isTrained = true;
isTrained = true;

Latest revision as of 13:37, 5 September 2008

import java.util.Arrays;

import de.jstacs.NonParsableException;
import de.jstacs.NotTrainedException;
import de.jstacs.data.AlphabetContainer;
import de.jstacs.data.Sample;
import de.jstacs.data.Sequence;
import de.jstacs.io.XMLParser;
import de.jstacs.models.AbstractModel;
import de.jstacs.results.NumericalResult;
import de.jstacs.results.NumericalResultSet;



/**
 * Homogeneous Markov model of order 0: every position of a sequence is scored
 * independently with the same per-symbol distribution.
 *
 * <p>The parameters are stored as natural-log probabilities and are estimated by
 * (weighted) relative symbol frequencies, i.e. maximum likelihood without a prior.
 * A symbol that never occurs in the training data therefore gets a log-probability
 * of negative infinity.
 */
public class HomogeneousMarkovModel extends AbstractModel {

	/** Natural-log probability of each symbol, indexed by the symbol's discrete value. */
	private double[] logProbs;
	/** Whether the parameters have been estimated (by {@code train} or restored from XML). */
	private boolean isTrained;

	/**
	 * Creates an untrained model over the given alphabet.
	 *
	 * @param alphabets the alphabet container; must be simple and discrete
	 * @throws Exception if the alphabet container is not simple and discrete
	 *             (reported as an {@link IllegalArgumentException})
	 */
	public HomogeneousMarkovModel( AlphabetContainer alphabets ) throws Exception {
		super( alphabets, 0 ); //we have a homogeneous Model, hence the length is set to 0
		//a homogeneous Model can only handle simple alphabets
		if(! (alphabets.isSimple() && alphabets.isDiscrete()) ){
			throw new IllegalArgumentException("Only simple and discrete alphabets allowed");
		}
		//one parameter per symbol of the (single) alphabet
		this.logProbs = new double[(int) alphabets.getAlphabetLengthAt( 0 )];
		isTrained = false; //we have not trained the model, yet
	}

	/**
	 * Restores a model from its XML representation as produced by {@link #toXML()}.
	 *
	 * @param stringBuff the XML representation
	 * @throws NonParsableException if the XML could not be parsed
	 */
	public HomogeneousMarkovModel( StringBuffer stringBuff ) throws NonParsableException {
		super( stringBuff );
	}

	/**
	 * Reads all member variables from the content of the
	 * {@code homogeneousMarkovModel} tag.
	 *
	 * @param xml the XML representation
	 * @throws NonParsableException if a required tag could not be extracted
	 */
	protected void fromXML( StringBuffer xml ) throws NonParsableException {
		//extract our XML-code
		xml = XMLParser.extractForTag( xml, "homogeneousMarkovModel" );
		//extract all the variables using XMLParser
		alphabets = (AlphabetContainer) XMLParser.extractStorableForTag( xml, "alphabets" );
		length = XMLParser.extractIntForTag( xml, "length" );
		logProbs = XMLParser.extractDoubleArrayForTag( xml, "logProbs" );
		isTrained = XMLParser.extractBooleanForTag( xml, "isTrained" );
	}

	/**
	 * Serializes the alphabet, the length, the log-probabilities and the training
	 * flag, wrapped in a {@code homogeneousMarkovModel} tag.
	 *
	 * @return the XML representation of this model
	 */
	public StringBuffer toXML() {
		StringBuffer buf = new StringBuffer();
		//pack all the variables using XMLParser
		XMLParser.appendStorableWithTags( buf, alphabets, "alphabets" );
		XMLParser.appendIntWithTags( buf, length, "length" );
		XMLParser.appendDoubleArrayWithTags( buf, logProbs, "logProbs" );
		XMLParser.appendBooleanWithTags( buf, isTrained, "isTrained" );
		//add our own tag
		XMLParser.addTags( buf, "homogeneousMarkovModel" );
		return buf;
	}

	/**
	 * @return a human-readable name of this model type
	 */
	public String getInstanceName() {
		return "Homogeneous Markov model of order 0";
	}

	/**
	 * @return always 0, because the parameters are estimated by maximum likelihood
	 *         and no prior is used
	 */
	public double getLogPriorTerm() throws Exception {
		//we use ML-estimation, hence no prior term
		return 0;
	}

	/**
	 * @return a result set containing the size of the parameter array
	 */
	public NumericalResultSet getNumericalCharacteristics() throws Exception {
		//we do not have much to tell here
		return new NumericalResultSet(new NumericalResult("Number of parameters","The number of parameters this model uses",logProbs.length));
	}

	/**
	 * Computes the log-probability of the part of {@code sequence} between
	 * {@code startpos} and {@code endpos} (both inclusive) as the sum of the
	 * log-probabilities of its symbols.
	 *
	 * @param sequence the sequence to score
	 * @param startpos index of the first scored position
	 * @param endpos index of the last scored position (inclusive)
	 * @return the natural-log probability of the specified part of the sequence
	 * @throws NotTrainedException if the model has not been trained yet
	 */
	public double getLogProbFor( Sequence sequence, int startpos, int endpos ) throws NotTrainedException, Exception {
		//without this check an untrained model would silently report log-probability 0
		//(i.e. probability 1), since the parameter array is zero-initialized
		if( !isTrained ){
			throw new NotTrainedException( "The model is not trained yet." );
		}
		double seqLogProb = 0.0;
		for(int i=startpos;i<=endpos;i++){
			//directly access the array by the numerical representation of the symbols
			seqLogProb += logProbs[sequence.discreteVal( i )];
		}
		return seqLogProb;
	}

	/**
	 * Computes the probability of the part of {@code sequence} between
	 * {@code startpos} and {@code endpos} (both inclusive).
	 *
	 * @param sequence the sequence to score
	 * @param startpos index of the first scored position
	 * @param endpos index of the last scored position (inclusive)
	 * @return the probability, i.e. the exponential of
	 *         {@link #getLogProbFor(Sequence, int, int)}
	 * @throws NotTrainedException if the model has not been trained yet
	 */
	public double getProbFor( Sequence sequence, int startpos, int endpos ) throws NotTrainedException, Exception {
		return Math.exp( getLogProbFor(sequence, startpos, endpos) );
	}

	/**
	 * @return {@code true} if the parameters of this model have been estimated
	 */
	public boolean isTrained() {
		return isTrained;
	}

	/**
	 * Estimates the symbol log-probabilities from the weighted symbol counts of
	 * {@code data}. The model is only modified if training succeeds.
	 *
	 * @param data the training sequences
	 * @param weights one weight per sequence, or {@code null} to weight every
	 *            sequence with 1
	 * @throws IllegalArgumentException if {@code weights} does not contain exactly
	 *             one entry per sequence, or if the total weighted symbol count is
	 *             not positive (e.g. empty data or all-zero weights)
	 */
	public void train( Sample data, double[] weights ) throws Exception {
		int numberOfSequences = data.getNumberOfElements();
		if( weights != null && weights.length != numberOfSequences ){
			throw new IllegalArgumentException( "Expected one weight per sequence, but got "
					+ weights.length + " weights for " + numberOfSequences + " sequences" );
		}
		//count into a scratch array so that a failure cannot leave the model
		//with half-updated parameters
		double[] counts = new double[logProbs.length];
		//for each sequence in the data set
		for(int i=0;i<numberOfSequences;i++){
			Sequence seq = data.getElementAt( i );
			//if we do have any weights, use them; otherwise each sequence counts once
			double w = weights == null ? 1.0 : weights[i];
			//for each position in the sequence
			int seqLength = seq.getLength();
			for(int j=0;j<seqLength;j++){
				//count symbols, weighted by weights
				counts[ seq.discreteVal( j ) ] += w;
			}
		}
		//compute normalization
		double norm = 0.0;
		for(int i=0;i<counts.length;i++){
			norm += counts[i];
		}
		//a non-positive (or NaN) total would turn every parameter into NaN
		if( !(norm > 0.0) ){
			throw new IllegalArgumentException( "Cannot estimate parameters: the total weighted symbol count is " + norm );
		}
		//normalize the counts to proper probabilities and store their logarithms
		for(int i=0;i<counts.length;i++){
			counts[i] = Math.log( counts[i]/norm );
		}
		logProbs = counts;
		//now the model is trained
		isTrained = true;
	}

}