///////////////////////////////////////////////////////////////////////
// VerbLearn System
//
// A crosslinguistic bodily-grounded hand-action
// verb semantics acquisition system.
//
// David Bailey 1/97
// Converted from Sather 1.1
//
// File: Lexicon.java
//
// Description: Core classes for lexical semantic representation and learning.
//
// Classes: Lexicon, Slot, Word, Sense, PFstruct, Dist, Fstruct

import java.io.*;
import java.util.*;

///////////////////////////////////////////////////////////////////////
// A multi-slot collection of word meanings, with functionality for
// labelling, obeying, and learning.

class Lexicon implements Serializable {
  private Slot[] slots;
  private String id;

  Lexicon() {
    id = "New/Unnamed";
    slots = new Slot[VerbLearn.context().numSlots()];
    for (int i = 0; i < VerbLearn.context().numSlots(); i++) {
      slots[i] = new Slot(i);
    }
  }
  // (Alternatively, a Lexicon can be constructed by loading a serialized
  // object.)

  public String id() { return id; }

  public void setId(String newId) { id = newId; }

  public Slot slot(int i) { return slots[i]; }

  // Returns the Word for 'w' (from the left-most slot if it appears in
  // more than one slot).
  public Word getWord(String w) {
    for (int i = 0; i < slots.length; i++) {
      Word word = slots[i].get(w);
      if (word != null) return word;
    }
    return null;
  }

  // Modify self to reflect a new labelled action.
  public void incorporate(Fstruct f, VerbComplex vc) {
    // Optional (TBD): Remove features explained by well-learned
    // words in the verb complex.

    // Incorporate remaining features paired with the verb complex
    // (TBD: include words which caused explaining-away above?)
    for (int i = 0; i < slots.length; i++) {
      if (vc.slotFilled(i)) {
        slots[i].incorporate(f, vc.getWord(i));
      }
    }
  }

  // Performs a merge loop if there have been any actions incorporated
  // since the last merge loop. Used to "clean up" at the end of training.
  public void synch() {
    for (int i = 0; i < slots.length; i++) {
      slots[i].synch();
    }
  }

  // Given a linking Fstruct, return the best-matching (multi-slot) label.
  public VerbComplex label(Fstruct f) {
    // Copy f so that features only get explained away in the local context!
    f = new Fstruct(f);

    VerbComplex vc = new VerbComplex(VerbLearn.context().numSlots());
    int numFilled = 0;
    CandidateSense[] candidates =
        new CandidateSense[VerbLearn.context().numSlots()];

    // Repeatedly get the best fillers for each unfilled slot and choose
    // the best one if its posterior >= minLabel:
    while (true) {
      // Get the best label for each unfilled slot:
      for (int i = 0; i < slots.length; i++) {
        candidates[i] = null;
        if (vc.getWord(i) == null) {
          candidates[i] = slots[i].label(f);
        }
      }

      // Find the slot with the MAP label:
      int winningSlot = -1;
      for (int i = 0; i < slots.length; i++) {
        if (vc.getWord(i) == null && candidates[i] != null &&
            (winningSlot == -1 ||
             candidates[i].posterior >= candidates[winningSlot].posterior)) {
          winningSlot = i;
        }
      }

      // If the match is good enough, apply the label.
      // Note this allows the possibility of producing an empty label
      // meaning "I can't describe this".
      if (winningSlot == -1 ||
          candidates[winningSlot].posterior <
              VerbLearn.context().params().minLabel()) {
        break;
      }
      vc.setWord(winningSlot, candidates[winningSlot].label);
      numFilled += 1;
      if (numFilled == VerbLearn.context().numSlots()) {
        break;
      }

      // Remove explained features from 'f' before we go back and
      // embellish our label by filling further slots:
      f.explainAway(candidates[winningSlot].sense);
    }

    // Finally, return the verb complex:
    return vc;
  }
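  // A worked sketch of the labelling loop above (illustrative; the
  // words and numbers are hypothetical): with two slots, suppose slot
  // 0's best candidate is "push" at posterior 0.4 and slot 1's is
  // "down" at 0.7. "down" is applied first, and the features its sense
  // explains are removed from f. If on the next pass "push" only
  // reaches 0.05 against the reduced f, and minLabel is 0.1, the loop
  // stops and the final label is just "down" in slot 1.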
  // A more detailed version of label(); this routine returns the
  // posterior probabilities of all senses in the lexicon.
  public String allSensePosteriors(Fstruct f) {
    StringBuffer res = new StringBuffer();
    for (int i = 0; i < slots.length; i++) {
      Vector cands = slots[i].allSensePosteriors(f);
      Enumeration cEnum = cands.elements();
      while (cEnum.hasMoreElements()) {
        res.append((CandidateSense)cEnum.nextElement());
      }
    }
    return res.toString();
  }

  // Return a linking Fstruct compatible with a command and initial
  // state linking features.
  public Fstruct obey(VerbComplex cmd, Fstruct state) {
    // Conflict avoidance: Choose minimally conflicting senses of the
    // various words in the command. For now this is done by choosing
    // the sense for each word which best matches the initial world state.
    // There is no inter-word conflict avoidance.
    Sense[] winningSenses = new Sense[slots.length];
    for (int i = 0; i < slots.length; i++) {
      winningSenses[i] = slots[i].obey(cmd.getWord(i), state);
    }

    // Conflict resolution: Merge the chosen senses into a linking Fstruct.
    // For each motoric linking feature, choose the MAP value from the most
    // peaked distribution. (This is actually implemented in Fstruct.)
    return new Fstruct(winningSenses, state);
  }

  public String toString() {
    StringBuffer res = new StringBuffer();
    res.append("Lexicon " + id + " (" + totalSenses() + " senses):\n");
    for (int i = 0; i < slots.length; i++) {
      res.append(slots[i]);
    }
    return res.toString();
  }

  public int totalSenses() {
    int res = 0;
    for (int i = 0; i < slots.length; i++) {
      res += slots[i].totalSenses();
    }
    return res;
  }
}
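///////////////////////////////////////////////////////////////////////
// A worked sketch of obey() above (illustrative; the words and feature
// values are hypothetical): given the command "push down", conflict
// avoidance picks, for each word independently, the sense that best
// fits the initial world state. If the chosen "push" sense strongly
// prefers elbow=extend while the chosen "down" sense is nearly uniform
// on that feature, conflict resolution keeps elbow=extend, since the
// Fstruct constructor takes each feature from the most peaked
// distribution. Features already set in 'state' override both, and
// unknown words contribute a null entry, which the constructor skips.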
///////////////////////////////////////////////////////////////////////
// Contains all words occurring in a single grammatical position, or
// "slot". Also contains a local copy of the number of virtual samples
// to use for each feature, to allow slot-specific adaptation.

class Slot implements Serializable {
  private int id;
  public int id() { return id; }

  private Hashtable words;    // maps a word's String form to its Word object
  public Hashtable virtuals;  // maps String feature names to Double
  private double numEx;       // number of training examples in this slot

  Slot(int id) {
    this.id = id;
    words = new Hashtable();
    virtuals = VerbLearn.context().params().copyOfVirtuals();
  }

  // Return the Word corresponding to 'w', or null if none exists.
  public Word get(String w) {
    return (Word)words.get(w);
  }

  public String[] wordList() {
    String[] wl = new String[words.size()];
    int i = 0;
    Enumeration kEnum = words.keys();
    while (kEnum.hasMoreElements()) {
      wl[i] = (String)kEnum.nextElement();
      i = i + 1;
    }
    return wl;
  }

  // Update the slot to include a new training instance, adding a new Word
  // if necessary.
  public void incorporate(Fstruct f, String w) {
    numEx++;

    // Add a new Word if necessary:
    if (!words.containsKey(w)) {
      words.put(w, new Word(w, virtuals, this));
    }

    // Modify the Word to include the new usage:
    boolean ranMerge = ((Word)words.get(w)).incorporate(f);

    // Update the slot-specific virtual samples:
    if (VerbLearn.context().params().adaptVirtuals() && ranMerge) {
      adaptVirtuals();
    }
  }

  public void synch() {
    Enumeration wEnum = words.elements();
    while (wEnum.hasMoreElements()) {
      ((Word)wEnum.nextElement()).synch();
    }
    if (VerbLearn.context().params().adaptVirtuals()) {
      adaptVirtuals();
    }
  }

  // For each feature, do a weighted average of (1) the current
  // settings, and (2) the numEx-averaged value of v = (1-p)/(pn-1)
  // across all senses (where n is the # of values, and p is the
  // probability of the mode value).
  private void adaptVirtuals() {
    // Iteratively adapt each feature:
    Enumeration fEnum = virtuals.keys();
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      double vAvg = 0;
      double count = 0;

      // Double loop to average the mode's probability over all senses
      // in the slot:
      Enumeration wEnum = words.elements();
      while (wEnum.hasMoreElements()) {
        Enumeration sEnum = ((Word)wEnum.nextElement()).senseEnum();
        while (sEnum.hasMoreElements()) {
          Sense s = (Sense)sEnum.nextElement();
          double p = s.getDist(fname).likelihood(s.getDist(fname).mode());
          // Avoid infinity when adaptVirtuals encounters a uniform dist:
          vAvg = vAvg + s.numEx() *
              Math.min(VerbLearn.context().params().maxVirtual(),
                       (1 - p) /
                       (p * VerbLearn.context().params()
                                .featureVals(fname).size() - 1));
          count = count + s.numEx();
        }
      }

      // Weight the above against the current virtuals setting:
      vAvg = vAvg + VerbLearn.context().params().virtualInertia() *
                    ((Double)virtuals.get(fname)).doubleValue();
      count = count + VerbLearn.context().params().virtualInertia();
      vAvg = vAvg / count;

      // Finally set the new value:
      virtuals.put(fname, new Double(vAvg));
    }
  }

  public CandidateSense label(Fstruct f) {
    CandidateSense leading = new CandidateSense();
    leading.posterior = -1;
    Enumeration wEnum = words.elements();
    while (wEnum.hasMoreElements()) {
      Word w = (Word)wEnum.nextElement();
      CandidateSense cur = w.bestSenseFor(f, numEx);
      if (cur.posterior > leading.posterior) {
        leading = cur;
        leading.label = w.id();
      }
    }
    return leading;
  }

  // Return the posteriors of all word senses.
  public Vector allSensePosteriors(Fstruct f) {
    Vector res = new Vector();  // Vector of CandidateSense
    Enumeration wEnum = words.elements();
    while (wEnum.hasMoreElements()) {
      Vector cands = ((Word)wEnum.nextElement()).allSensePosteriors(f, numEx);
      Enumeration cEnum = cands.elements();
      while (cEnum.hasMoreElements()) {
        res.addElement((CandidateSense)cEnum.nextElement());
      }
    }
    return res;
  }
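  // A sketch of where v = (1-p)/(pn-1) in adaptVirtuals() above comes
  // from (an illustrative reconstruction, not from the original
  // comments): for a single-example sense, the mode value has count 1
  // and the other values count 0, so with v virtual samples per value
  // and n values, the mode's probability is p = (1+v)/(1+nv). Solving
  // for v:
  //   p(1 + nv) = 1 + v  =>  v(pn - 1) = 1 - p  =>  v = (1-p)/(pn-1).
  // E.g. n = 3 values and a desired mode probability p = 0.5 gives
  // v = 0.5/0.5 = 1 virtual sample per value.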
  // Return the Sense for word 'cmd' which best fits the world 'state',
  // or null if the best fit's posterior isn't >= minObey or if
  // the word is unknown.
  public Sense obey(String cmd, Fstruct state) {
    if (cmd == null) return null;
    Word w = (Word)words.get(cmd);
    if (w == null) return null;
    CandidateSense cand = w.bestSenseFor(state, w.numEx());
    if (cand.posterior >= VerbLearn.context().params().minObey()) {
      return cand.sense;
    } else {
      return null;
    }
  }

  public String toString() {
    StringBuffer res = new StringBuffer();
    res.append("Slot ").append(id).append(" (").append(totalSenses())
       .append(" senses) virtuals: ");
    Enumeration fEnum = VerbLearn.context().params().featureNames();
    boolean firstTime = true;
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      if (firstTime) {
        firstTime = false;
      } else {
        res.append(" ");
      }
      res.append(fname).append("=")
         .append(Fmt.dbl(((Double)virtuals.get(fname)).doubleValue()));
    }
    res.append("\n");
    return res.toString();
  }

  public int totalSenses() {
    int res = 0;
    Enumeration wEnum = words.elements();
    while (wEnum.hasMoreElements()) {
      res += ((Word)wEnum.nextElement()).totalSenses();
    }
    return res;
  }
}

///////////////////////////////////////////////////////////////////////
// Representation of a word. Mainly consists of a set of senses.

class Word implements Serializable {
  public static final double LN_TO_LOG = Math.log(10);

  private String id;
  private Vector senses;                  // Vector of Sense
  private PriorityQueue candidateMerges;  // ordered by similarity
  private int backlog;      // number of examples incorporated since last merge
  private Vector examples;  // Vector of Fstruct; stores training data for the
                            // word, used in computing likelihoods.
                            // Unfortunately keeping this is implausible.
  private int nextSenseNumber;  // used to give unique names to senses
  private Slot slot;  // pointer to parent, from which we need to READ
                      // (1) slot number, and (2) virtuals table

  public String id() { return id; }

  public double numEx() { return examples.size(); }

  Word(String w, Hashtable v, Slot s) {
    id = w;
    senses = new Vector();
    candidateMerges = new PriorityQueue();
    examples = new Vector();
    nextSenseNumber = 1;
    slot = s;
  }

  // Incorporate a new training instance by adding a new sense for it
  // and possibly performing some merging (returns true iff so).
  public boolean incorporate(Fstruct f) {
    examples.addElement(f);
    Sense s = new Sense(id + nextSenseNumber, f, slot.virtuals);
    VerbLearn.trainLog().append("Creating new sense " + s);
    nextSenseNumber++;
    backlog++;
    senses.addElement(s);
    addNewCandidateMerges(s);
    if (backlog >= VerbLearn.context().params().batchSize()) {
      synch();
      return true;
    } else {
      return false;
    }
  }

  public void synch() {
    mergeUntilLocalMax();
    backlog = 0;
  }
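  // The training regime above, in brief (an illustrative trace,
  // assuming a fresh word and batchSize = 5): each of the first five
  // examples becomes its own single-example sense; the fifth pushes
  // backlog up to batchSize, so synch() runs the merge loop, which may
  // compress the five senses into one or two whose distributions
  // summarize the merged examples. backlog then resets to 0.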
  // Find the best merge candidates and perform the merge, repeating until
  // no satisfactory merge candidates are found or the posterior drops.
  private void mergeUntilLocalMax() {
    // Determine the posterior for the current Word model:
    double logPrior = logPrior();
    double logLike = logLikelihoodOfDataset();
    double oldLogPost = logPrior + logLike;
    double newLogPost = 0;  // this initialization is only to satisfy compiler
    double rate;
    Sense newSense = null;

    if (VerbLearn.context().params().testRecognition()) {
      rate = recognitionRate();
      String rateStr;
      if (rate >= 0) {
        rateStr = String.valueOf((int)(rate * 100)) + "%";
      } else {
        rateStr = "N/A";
      }
      VerbLearn.trainLog().append("Starting merging for " + id +
          " (log prior=" + Fmt.dbl(logPrior) +
          ", log likelihood=" + Fmt.dbl(logLike) +
          ", log posterior=" + Fmt.dbl(oldLogPost) +
          ", recog rate=" + rateStr + "):\n");
    } else {
      VerbLearn.trainLog().append("Starting merging for " + id +
          " (log prior=" + Fmt.dbl(logPrior) +
          ", log likelihood=" + Fmt.dbl(logLike) +
          ", log posterior=" + Fmt.dbl(oldLogPost) + "):\n");
    }

    // Repeatedly find and commit merges until no acceptable merge can
    // be found:
    while (true) {
      CandidateMerge cand;
      boolean commitMerge = false;
      int failedTries = 0;
      Vector putBackList = new Vector();  // Vector of CandidateMerge

      // Try out merges until there are no remaining "plausible"
      // (adequate similarity) merges on the queue, or maxTries
      // tentative merges have failed to raise the posterior:
      while (true) {
        // Pick the highest-similarity candidate merge if such exists
        // and see if it's plausible enough:
        cand = (CandidateMerge)candidateMerges.remove();
        if (cand == null ||
            cand.similarity < VerbLearn.context().params().minMerge()) {
          if (senses.size() == 1) {
            VerbLearn.trainLog().append("Stopped merging because only " +
                                        "one sense remains.\n");
          } else {
            VerbLearn.trainLog().append("Stopped merging because no (more) " +
                                        "plausible merges were found.\n");
          }
          break;
        }

        // Try out the candidate merge:
        newSense = merge(cand);
        logPrior = logPrior();
        logLike = logLikelihoodOfDataset();
        newLogPost = logPrior + logLike;
        if (VerbLearn.context().params().testRecognition()) {
          rate = recognitionRate();
          String rateStr;
          if (rate >= 0) {
            rateStr = String.valueOf((int)(rate * 100)) + "%";
          } else {
            rateStr = "N/A";
          }
          VerbLearn.trainLog().append("New " + id +
              " log prior=" + Fmt.dbl(logPrior) +
              ", log likelihood=" + Fmt.dbl(logLike) +
              ", log posterior=" + Fmt.dbl(newLogPost) +
              ", recog rate=" + rateStr + ".\n");
        } else {
          VerbLearn.trainLog().append("New " + id +
              " log prior=" + Fmt.dbl(logPrior) +
              ", log likelihood=" + Fmt.dbl(logLike) +
              ", log posterior=" + Fmt.dbl(newLogPost) + ".\n");
        }

        // If an improvement, indicate the merge should be committed;
        // otherwise undo it and see if we should try again:
        if (newLogPost >= oldLogPost) {
          commitMerge = true;
          break;
        } else {
          unmerge(cand, newSense);
          putBackList.addElement(cand);
          failedTries++;
          if (failedTries == VerbLearn.context().params().maxTries()) {
            VerbLearn.trainLog()
                .append("Stopped merging because the " +
                        (VerbLearn.context().params().maxTries() > 1 ?
                         (String.valueOf(VerbLearn.context().params()
                                             .maxTries()) +
                          " most plausible merges ") :
                         "most plausible merge ") +
                        "reduced the posterior.\n");
            break;
          }
        }
      }
      // Restore bum merges since they might (?) work out in the future:
      for (int i = 0; i < putBackList.size(); i++) {
        CandidateMerge cm = (CandidateMerge)putBackList.elementAt(i);
        candidateMerges.insert(cm, cm.similarity);
      }

      // Commit the Word model to the most-recent merge if indicated,
      // else terminate the outer merging loop:
      if (commitMerge) {
        purgeCandidateMerges(cand);
        addNewCandidateMerges(newSense);
        oldLogPost = newLogPost;
        if (failedTries > 0) {
          VerbLearn.trainLog().append("*** AN EXAMPLE OF KEEPING A " +
                                      "NON-TOP-RANKED MERGE ***");
        }
      } else {
        break;
      }
    }
  }

  // Returns new sense to allow caller to undo merge with unmerge().
  private Sense merge(CandidateMerge cand) {
    senses.removeElement(cand.s1);
    senses.removeElement(cand.s2);
    // Note: uses the current virtual samples for the new sense:
    Sense newSense = new Sense(cand.s1, cand.s2, slot.virtuals,
                               id + nextSenseNumber);
    nextSenseNumber++;
    senses.addElement(newSense);
    VerbLearn.trainLog().append("Merging " + cand.s1.id() + " and " +
        cand.s2.id() + " (similarity " + Fmt.dbl(cand.similarity) +
        ") to form " + newSense);
    return newSense;
  }

  // Intended for use when a merge lowers the posterior of the model.
  private void unmerge(CandidateMerge cand, Sense newSense) {
    senses.removeElement(newSense);
    senses.addElement(cand.s1);
    senses.addElement(cand.s2);
    VerbLearn.trainLog().append("Posterior decreased; removing " +
        newSense.id() + " and restoring " + cand.s1.id() + " and " +
        cand.s2.id() + ".\n");
  }

  // Adds to candidateMerges all the new potential merges involving a
  // new Sense.
  private void addNewCandidateMerges(Sense newSense) {
    Enumeration sEnum = senses.elements();
    while (sEnum.hasMoreElements()) {
      Sense s = (Sense)sEnum.nextElement();
      if (s != newSense) {
        CandidateMerge cand =
            new CandidateMerge(newSense, s, newSense.similarityTo(s));
        candidateMerges.insert(cand, cand.similarity);
      }
    }
  }

  // Removes from candidateMerges all potential merges involving either
  // sense contained in 'merged'.
  private void purgeCandidateMerges(CandidateMerge merged) {
    // THIS IS REALLY GROSS AND SHOULD BE REPLACED BY SOMETHING EFFICIENT:
    PriorityQueue pq = new PriorityQueue();
    while (!candidateMerges.isEmpty()) {
      CandidateMerge cand = (CandidateMerge)candidateMerges.remove();
      if (cand.s1 != merged.s1 && cand.s1 != merged.s2 &&
          cand.s2 != merged.s1 && cand.s2 != merged.s2) {
        pq.insert(cand, cand.similarity);
      }
    }
    candidateMerges = pq;
  }

  // Prior on the current word model.
  // Currently we use an exponentially decreasing function of
  // the number of word senses.
  // PERHAPS USE ANDREAS' IDEA TO INCREASE PRIOR WEIGHT DURING TRAINING
  // TO AVOID EARLY OVER-MERGING?
  // Returns the base-10 log of this prior.
  private double logPrior() {
    double w = VerbLearn.context().params().modelPriorWeight();
    double normalization = Math.exp(w) * (1 - Math.exp(-w));
    double prior = normalization * Math.exp(-w * senses.size());
    return Math.log(prior) / LN_TO_LOG;
  }

  // Likelihood of the set of examples given the word model.
  // Uses log base 10.
  private double logLikelihoodOfDataset() {
    double res = 0;
    Enumeration exEnum = examples.elements();
    while (exEnum.hasMoreElements()) {
      res = res + Math.log(likelihood((Fstruct)exEnum.nextElement())) /
                  LN_TO_LOG;
    }
    return res;
  }
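  // A worked check of logPrior() above (illustrative; w is just an
  // example value): the prior is P(n) = e^w (1 - e^-w) e^(-wn), which
  // sums to 1 over n = 1, 2, 3, ... For w = 1 and 2 senses:
  //   P(2) = e (1 - 1/e) e^-2 = (e - 1) e^-2 ~= 0.233,
  // so logPrior() returns log10(0.233) ~= -0.63. Each additional sense
  // costs a further w / ln(10) ~= 0.43 of log prior, so a merge that
  // removes one sense gains ~0.43 and is committed only if it loses
  // less than that in data log likelihood.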
  // Sums the likelihoods across all senses, weighted by the frequency of
  // each sense.
  // TBD: SHOULD FREQUENCY BE USED ONLY IF USEWORDFREQUENCIES IS ENABLED?
  private double likelihood(Fstruct f) {
    double res = 0;
    Enumeration sEnum = senses.elements();
    while (sEnum.hasMoreElements()) {
      Sense s = (Sense)sEnum.nextElement();
      res = res + (s.numEx() / numEx()) * s.likelihood(f);
    }
    return res;
  }

  // Test recognition results ONLY for this word, and report the
  // fraction which are correctly labelled.
  // If there are NO examples to test, a NEGATIVE NUMBER is returned.
  public double recognitionRate() {
    // THIS CODE IS ADAPTED FROM VERBLEARN.RECOGNIZE()
    int numCorrect = 0;
    int numTested = 0;
    Enumeration scEnum = VerbLearn.context().dataset().recognizeEnum();
    while (scEnum.hasMoreElements()) {
      Scenario sc = VerbLearn.context().scenarios().get(
          (String)scEnum.nextElement());
      // ONLY TEST EXAMPLES OF THIS CURRENT WORD
      if (sc.isLabelled() && sc.label().getWord(slot.id()).equals(id)) {
        String actual = VerbLearn.context().lexicon().label(sc.finalLink())
                            .toString();
        numTested++;
        if (actual.equals(sc.label().toString())) {
          numCorrect++;
        }
      }
    }
    if (numTested > 0) {
      return (double)numCorrect / (double)numTested;
    } else {
      return -1;
    }
  }

  // Return the closest matching Sense to 'f'. The 'posterior' field
  // includes a prior reflecting the frequency of the sense relative
  // to the 'divisor' parameter, which should be the Slot's total
  // number of examples (for labelling) or the Word's total number of
  // examples (for obeying). Also note: the label field of the return
  // value is unused.
  // THIS ROUTINE AND ITS RETURN TYPE ARE GETTING TOO OVERLOADED.
  public CandidateSense bestSenseFor(Fstruct f, double divisor) {
    CandidateSense leading = new CandidateSense();
    leading.posterior = -1;
    Enumeration sEnum = senses.elements();
    while (sEnum.hasMoreElements()) {
      Sense s = (Sense)sEnum.nextElement();
      double posterior;
      if (VerbLearn.context().params().useWordFrequencies()) {
        posterior = (s.numEx() / divisor) * s.likelihood(f);
      } else {
        posterior = s.likelihood(f);
      }
      if (posterior > leading.posterior) {
        leading.sense = s;
        leading.posterior = posterior;
      }
    }
    return leading;
  }

  // Return the posteriors of all word senses as a Vector.
  // 'divisor' should be the Slot's total number of examples.
  public Vector allSensePosteriors(Fstruct f, double divisor) {
    Vector res = new Vector();  // Vector of CandidateSense
    Enumeration sEnum = senses.elements();
    while (sEnum.hasMoreElements()) {
      Sense s = (Sense)sEnum.nextElement();
      double pr;
      if (VerbLearn.context().params().useWordFrequencies()) {
        pr = (s.numEx() / divisor);
      } else {
        pr = 1;
      }
      double li = s.likelihood(f);
      res.addElement(new CandidateSense(null, s, pr, li, pr * li));
    }
    return res;
  }

  public Enumeration senseEnum() {
    return senses.elements();
  }

  public String toString() {
    StringBuffer res = new StringBuffer();
    res.append("Model for ").append(id).append(" (").append(senses.size())
       .append(" sense");
    if (senses.size() > 1) res.append("s");
    res.append("):\n");
    Enumeration sEnum = senses.elements();
    while (sEnum.hasMoreElements()) {
      res.append("\n").append(sEnum.nextElement());
    }
    return res.toString();
  }

  public int totalSenses() {
    return senses.size();
  }
}
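///////////////////////////////////////////////////////////////////////
// A worked sketch of the mixture likelihood above (illustrative
// numbers): a word whose two senses cover 3 and 1 examples, and whose
// sense likelihoods for some Fstruct f are 0.02 and 0.10, gets
//   likelihood(f) = (3/4)(0.02) + (1/4)(0.10) = 0.04.
// The same frequency weights act as the prior in bestSenseFor() when
// useWordFrequencies is enabled: 0.75 * 0.02 = 0.015 loses to
// 0.25 * 0.10 = 0.025, so the rarer but better-fitting sense wins.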
///////////////////////////////////////////////////////////////////////
// A sense of a word. Basically just a Probabilistic Fstruct (PFstruct).

class Sense extends PFstruct implements Serializable {
  private String id;
  private int numEx;  // number of examples "explained" by this sense

  public String id() { return id; }

  public int numEx() { return numEx; }

  // For turning an instance into an initial sense.
  Sense(String id, Fstruct f, Hashtable v) {
    super(f, v);
    this.id = id;
    numEx = 1;
  }

  // For the merging operation.
  Sense(Sense s1, Sense s2, Hashtable virt, String id) {
    super(s1, s2, virt);
    this.id = id;
    numEx = s1.numEx + s2.numEx;
  }

  public String toString() {
    StringBuffer res = new StringBuffer().append(id).append(" (")
                           .append(numEx).append(" ex)");
    if (VerbLearn.context().params().verboseSenses()) {
      res.append(" {\n");
      Enumeration fEnum = VerbLearn.context().params().featureNames();
      while (fEnum.hasMoreElements()) {
        String f = (String)fEnum.nextElement();
        res = res.append("\t").append(f).append(" {")
                 .append((Dist)dists.get(f)).append("}\n");
      }
      res = res.append("}\n");
    } else {
      res.append("\n\t").append(new Fstruct(this).toString4());
    }
    return res.toString();
  }
}

///////////////////////////////////////////////////////////////////////
// Probabilistic Linking Feature Structures, used for word senses.
// IN THIS VERSION, WE ASSUME ALL PFSTRUCTS ARE OVER ALL LINKING FEATURES.

class PFstruct implements Serializable {
  protected Hashtable dists;  // maps String feature names to Dist
  // Should be "private protected" but that's gone with Java 1.1...

  // Construct from an Fstruct and a set of virtual sample counts.
  PFstruct(Fstruct f, Hashtable v) {
    dists = new Hashtable();

    // First, add the "virtual samples":
    Enumeration fEnum = v.keys();
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      dists.put(fname,
                new Dist(fname, ((Double)v.get(fname)).doubleValue()));
    }

    // Next, add the actual sample, f, by incrementing the appropriate
    // values:
    fEnum = f.features();
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      ((Dist)dists.get(fname)).incrementValue((String)f.get(fname));
    }
  }

  // Construct by merging two PFstructs.
  PFstruct(PFstruct pf1, PFstruct pf2, Hashtable v) {
    dists = new Hashtable();
    Enumeration fEnum = pf1.dists.keys();
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      dists.put(fname, new Dist(fname, (Dist)pf1.dists.get(fname),
                                (Dist)pf2.dists.get(fname),
                                ((Double)v.get(fname)).doubleValue()));
    }
  }

  public double likelihood(Fstruct f) {
    double res = 1;
    Enumeration fEnum = f.features();
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      res = res * ((Dist)dists.get(fname)).likelihood(f.get(fname));
    }
    return res;
  }

  // A heuristic measure of the similarity between two probabilistic
  // fstructs. Currently we multiply the similarities between the
  // distributions for each feature, but this may or may not be the
  // best combination function.
  public double similarityTo(PFstruct pf) {
    double res = 1;
    Enumeration fEnum = dists.keys();
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      res = res * ((Dist)dists.get(fname))
                      .similarityTo((Dist)pf.dists.get(fname));
    }
    return res;
  }

  public Dist getDist(String f) {
    return (Dist)dists.get(f);
  }
}
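///////////////////////////////////////////////////////////////////////
// A worked sketch of PFstruct.likelihood() above (illustrative
// numbers): features are treated as independent, so an Fstruct with
// two set features whose per-feature likelihoods under the sense are
// 0.7 and 0.3 has likelihood 0.7 * 0.3 = 0.21. Only the features set
// in f contribute; each unset feature would sum to 1 across its
// values, so skipping it is equivalent to marginalizing it out.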
///////////////////////////////////////////////////////////////////////
// Discrete probability distribution

class Dist implements Serializable {
  private String id;         // holds the feature name
  private Hashtable counts;  // maps String feature values to Double counts.
                             // These Double wrappers are horribly inefficient.
  private double total;      // sum of counts (WITHOUT virtual samples added)
  private double virt;       // the number of virtual samples which should be
                             // implicitly added to each count
  private double kVirt;      // cached value of virt * number of values
                             // for this feature

  // Construct a new distribution with 'virt' virtual samples for each value
  // of feature 'fname'.
  Dist(String fname, double virt) {
    id = fname;
    this.virt = virt;
    kVirt = VerbLearn.context().params().featureVals(fname).size() * virt;
    counts = new Hashtable();
    Enumeration valEnum = VerbLearn.context().params()
                              .featureVals(fname).elements();
    while (valEnum.hasMoreElements()) {
      setCount((String)valEnum.nextElement(), 0);
    }
  }

  // Construct a new distribution by merging two existing
  // distributions. NOTE: This version does not add the virtual sample
  // component of each distribution, unlike the Sather version. That
  // version did not allow distributions to grow more peaked with
  // further training examples. Instead, we use the current virtual
  // sample count of the slot, passed in 'virt'. This algorithm still
  // needs its mathematical justification worked out.
  Dist(String fname, Dist d1, Dist d2, double virt) {
    id = fname;
    // if (d1.virt != d2.virt) {
    //   System.err.println("WARNING: Merging distributions with " +
    //                      "unequal virtuals\n");
    // }
    this.virt = virt;
    kVirt = VerbLearn.context().params().featureVals(fname).size() * virt;
    counts = new Hashtable();
    Enumeration valEnum = d1.counts.keys();
    while (valEnum.hasMoreElements()) {
      String valName = (String)valEnum.nextElement();
      setCount(valName, d1.getCount(valName) + d2.getCount(valName));
    }
    total = total + d1.total + d2.total;
  }

  // Used to account for a new example with the feature set to 'vname'.
  public void incrementValue(String vname) {
    setCount(vname, getCount(vname) + 1);
    total = total + 1;
  }

  // Return the probability of a particular value for the feature.
  public double likelihood(String val) {
    return (getCount(val) + virt) / (total + kVirt);
  }

  // Return the highest-probability feature value for this distribution.
  public String mode() {
    String leadingVal = null;
    double leadingCount = -1;
    Enumeration valEnum = counts.keys();
    while (valEnum.hasMoreElements()) {
      String val = (String)valEnum.nextElement();
      double count = getCount(val);
      if (count > leadingCount) {
        leadingCount = count;
        leadingVal = val;
      }
    }
    return leadingVal;
  }

  // Return the second-highest-probability feature value for this
  // distribution (or null if there is only one value in the dist).
  private String runnerUp() {
    String leadingVal = null;
    double leadingCount = -1;
    String runnerUpVal = null;
    double runnerUpCount = -1;
    Enumeration valEnum = counts.keys();
    while (valEnum.hasMoreElements()) {
      String val = (String)valEnum.nextElement();
      double count = getCount(val);
      if (count > leadingCount) {
        runnerUpVal = leadingVal;
        runnerUpCount = leadingCount;
        leadingVal = val;
        leadingCount = count;
      } else if (count > runnerUpCount) {
        runnerUpVal = val;
        runnerUpCount = count;
      }
    }
    return runnerUpVal;
  }

  // A heuristic measure of the peakedness of the distribution.
  // Currently we use the ratio of the likelihood of the mode to
  // the likelihood of the runner-up to the mode.
  // Thus, peakedness ranges from 1.0 for a uniform distribution,
  // up to infinity for a spike distribution.
  public double peakedness() {
    String ru = runnerUp();
    if (ru == null) return Double.POSITIVE_INFINITY;
    return likelihood(mode()) / likelihood(ru);
  }
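  // A worked example of the smoothing above (illustrative numbers):
  // for a feature with values {down, up}, counts {down=3, up=1}, and
  // virt = 0.5 (so kVirt = 1):
  //   likelihood(down) = (3 + 0.5) / (4 + 1) = 0.7
  //   likelihood(up)   = (1 + 0.5) / (4 + 1) = 0.3
  //   mode() = "down", peakedness() = 0.7 / 0.3 ~= 2.33.
  // Larger virt flattens the distribution, while virt -> 0 approaches
  // the raw empirical frequencies.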
  // A heuristic measure of the difference between two distributions.
  // Basically chi-square, except we use a normalized version to
  // factor out differences in number of samples, and then invert it
  // with 1/(1+chi) to give a measure ranging from 0 (very different)
  // to 1 (identical).
  public double similarityTo(Dist d) {
    double res = 1 / (1 + normalizedChiSquareWith(d));
    return res;
  }

  // From Numerical Recipes 2nd ed., p. 623 ("r" and "s" in that
  // alg. map to "this" and "d" respectively).
  // Probabilities rather than counts are used, since otherwise
  // chi-square is problematic because it also measures differences
  // in the number of samples in the two distributions.
  public double normalizedChiSquareWith(Dist d) {
    double res = 0;
    Enumeration iEnum = counts.keys();
    while (iEnum.hasMoreElements()) {
      String i = (String)iEnum.nextElement();
      double ri = likelihood(i);
      double si = d.likelihood(i);
      if (ri != 0 || si != 0) {
        res = res + Math.pow(ri - si, 2) / (ri + si);
      }
    }
    return res;
  }

  // This is used locally to reduce casting clutter.
  private double getCount(String val) {
    return ((Double)counts.get(val)).doubleValue();
  }

  // This is used locally to reduce casting clutter.
  private void setCount(String val, double count) {
    counts.put(val, new Double(count));
  }

  public String toString() {
    StringBuffer res = new StringBuffer();
    String m = mode();
    Enumeration valEnum = VerbLearn.context().params()
                              .featureVals(id).elements();
    while (valEnum.hasMoreElements()) {
      String valName = (String)valEnum.nextElement();
      if (res.length() != 0) res.append(" ");
      // For ease of reading, capitalize the mode value (if peaked enough):
      if (valName.equals(m) && peakedness() >=
          VerbLearn.context().params().minSetFeature()) {
        res.append(valName.toUpperCase());
      } else {
        res.append(valName);
      }
      res.append("=").append(Fmt.dbl(likelihood(valName)));
      // FOR COUNTS INSTEAD OF PROBABILITIES, USE:
      // ((Double)counts.get(valName)).doubleValue() + virt
    }
    return res.toString();
  }
}

///////////////////////////////////////////////////////////////////////
// Linking Feature Structures

class Fstruct implements Serializable {
  private Hashtable feats;  // maps String feature names to String values
  // IF HIDEOUSLY SLOW, MAY GO BACK TO ARRAYS AND NUMERICAL VALS

  // Construct an empty Fstruct.
  Fstruct() {
    feats = new Hashtable();
  }

  // Construct an Fstruct by copying another one.
  Fstruct(Fstruct f) {
    this();
    Enumeration featEnum = f.features();
    while (featEnum.hasMoreElements()) {
      String feat = (String)featEnum.nextElement();
      set(feat, f.get(feat));
    }
  }

  // Construct from a string of the form "f1=v1 f2=v2 f3=v3".
  Fstruct(String s) {
    this();
    if (s == null || s.trim().equals("")) return;
    try {
      NestedStreamTokenizer input =
          new NestedStreamTokenizer(new StringReader(s));
      int token = input.nextToken();
      while (token != NestedStreamTokenizer.TT_EOF) {
        if (token != NestedStreamTokenizer.TT_WORD) {
          throw(new IOException("expected feature name"));
        }
        String f = input.sval;
        token = input.nextToken();
        if (token != '=') {
          throw(new IOException("expected '='"));
        }
        token = input.nextToken();
        if (token != NestedStreamTokenizer.TT_WORD) {
          throw(new IOException("expected feature value"));
        }
        feats.put(f, input.sval);
        token = input.nextToken();
      }
    } catch (IOException e) {
      System.err.println("Fstruct-from-string exception: " + e.getMessage());
      feats = new Hashtable();
    }
  }
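  // An illustrative use of the string constructor above (the feature
  // names and values here are hypothetical placeholders; real ones come
  // from the parameter file):
  //   Fstruct f = new Fstruct("elbow=flex aspect=once");
  //   f.get("elbow")   => "flex"
  //   f.isSet("force") => false
  // A malformed string logs a parse error and yields an empty Fstruct.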
  // Construct an Fstruct corresponding to the MAP feature values in a
  // PFstruct (but only for those features whose peakedness >= minSetFeature).
  Fstruct(PFstruct pfstruct) {
    this(singletonArray(pfstruct));
  }

  // Helper to the constructor Fstruct(PFstruct):
  private static PFstruct[] singletonArray(PFstruct pf) {
    PFstruct[] result = new PFstruct[1];
    result[0] = pf;
    return result;
  }

  // Construct an Fstruct from an array of PFstructs, resolving
  // conflicts on a feature-by-feature basis by choosing the MAP value
  // from the most peaked distribution across the PFstructs for each
  // feature, so long as the peakedness >= minSetFeature. Entries
  // in the array may be null.
  Fstruct(PFstruct[] pfstructs) {
    this();
    // Loop through all linking features:
    Enumeration fgen = VerbLearn.context().params().featureNames();
    while (fgen.hasMoreElements()) {
      String f = (String)fgen.nextElement();

      // Choose the value from the peakiest distribution:
      int winner = -1;
      double winnerPeakedness = -1;
      for (int i = 0; i < pfstructs.length; i++) {
        if (pfstructs[i] != null) {
          double currentPeakedness = pfstructs[i].getDist(f).peakedness();
          if (currentPeakedness > winnerPeakedness) {
            winner = i;
            winnerPeakedness = currentPeakedness;
          }
        }
      }

      // Only set the feature if the winner's peakedness >= minSetFeature:
      // BUT IF THE FEATURE NAME IS "SCHEMA", SET IT REGARDLESS OF HOW
      // LOW THE PEAKEDNESS MIGHT BE. WE NEED TO EXECUTE *SOME* ACTION...
      if (winner != -1 &&
          (winnerPeakedness >=
               VerbLearn.context().params().minSetFeature() ||
           f.equals("schema"))) {
        set(f, pfstructs[winner].getDist(f).mode());
      }
    }
  }

  // Construct an Fstruct from an array of PFstructs as in the above
  // constructor, except that any features set in the Fstruct 'override'
  // take precedence.
  Fstruct(PFstruct[] pfstructs, Fstruct override) {
    this(pfstructs);
    Enumeration fEnum = override.features();
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      set(fname, override.get(fname));
    }
  }

  // Set the value of a feature.
  public void set(String f, String val) {
    feats.put(f, val);
  }

  // Return whether a feature is currently set.
  public boolean isSet(String f) {
    return feats.containsKey(f);
  }

  // Remove a currently set feature.
  public void unset(String f) {
    feats.remove(f);
  }

  // Get the value of a feature.
  public String get(String f) {
    return (String)feats.get(f);
  }

  // Remove features from self which are "explained" by the PFstruct.
  public void explainAway(PFstruct pfstruct) {
    // Collect the explained features first, since removing entries from
    // a Hashtable while enumerating its keys is unsafe:
    Vector explained = new Vector();
    Enumeration fgen = feats.keys();
    while (fgen.hasMoreElements()) {
      String f = (String)fgen.nextElement();
      if (pfstruct.getDist(f).likelihood(get(f)) >
          VerbLearn.context().params().minExplain()) {
        explained.addElement(f);
      }
    }
    for (int i = 0; i < explained.size(); i++) {
      unset((String)explained.elementAt(i));
    }
  }

  // Returns an Enumeration containing the feature names
  // which are set in this fstruct.
  public Enumeration features() {
    return feats.keys();
  }

  public String toString() {
    StringBuffer res = new StringBuffer("{\n");
    Enumeration fEnum = VerbLearn.context().params().featureNames();
    boolean firstTime = true;
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      if (isSet(fname)) {
        if (firstTime) {
          firstTime = false;
        } else {
          res.append(" ");
        }
        res.append(" " + fname + "=" + get(fname) + "\n");
      }
    }
    res.append("}\n");
    return res.toString();
  }
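  // A worked sketch of the conflict resolution above (illustrative
  // numbers): suppose two senses both carry a distribution for "elbow".
  // One has P(flex)=0.8, P(extend)=0.2 (peakedness 4.0); the other has
  // P(flex)=0.45, P(extend)=0.55 (peakedness ~1.2). The first is
  // peakier, so elbow=flex is chosen, provided 4.0 >= minSetFeature.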
  // String version with 4 features per line of text.
  public String toString4() {
    StringBuffer res = new StringBuffer().append("{");
    Enumeration fEnum = VerbLearn.context().params().featureNames();
    boolean firstTime = true;
    int counter = 0;
    while (fEnum.hasMoreElements()) {
      String fname = (String)fEnum.nextElement();
      if (isSet(fname)) {
        if (firstTime) {
          firstTime = false;
        } else {
          if (counter > 0) res.append(" ");
          else res.append("\n\t");
        }
        counter = (counter + 1) % 4;  // % is modulus
        res.append(fname + "=" + get(fname));
      }
    }
    res.append("}\n");
    return res.toString();
  }
}
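///////////////////////////////////////////////////////////////////////
// An illustrative rendering of toString4() above (hypothetical feature
// names): with six features set, the output wraps after every four:
//   {f1=v1 f2=v2 f3=v3 f4=v4
//           f5=v5 f6=v6}
// (the continuation line is indented with a tab).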