/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.lexparser.ArabicUnknownWordSignatures;
import edu.stanford.nlp.parser.lexparser.IntTaggedWord;
import edu.stanford.nlp.parser.lexparser.Lexicon;
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.parser.lexparser.Test;
import edu.stanford.nlp.parser.lexparser.Train;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.trees.DiskTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Numberer;
import edu.stanford.nlp.util.StringUtils;
import java.io.BufferedReader;
import java.io.FileFilter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class BaseLexicon
implements Lexicon {
    // Debug switch; not referenced by any code visible in this file.
    private static final boolean DEBUG_LEXICON = false;
    // Sentinel word index (-1). The methods below use the literal -1 in the word slot of an
    // IntTaggedWord to mean "any word" (tag-only entries).
    protected static final int nullWord = -1;
    // Sentinel tag index (-1). The literal -1 in the tag slot means "any tag" (word-only entries).
    protected static final short nullTag = -1;
    // Selects which unknown-word signature scheme getSignature() applies (clamped to 0..8 by the constructor).
    protected int unknownLevel;
    // score(): words seen more than this many times use the unsmoothed estimate c(tag,word)/c(word).
    protected int smoothInUnknownsThreshold;
    // score(): when true, rarely seen words are additionally smoothed with the m_TT / m_T tables.
    protected boolean smartMutation;
    // Lazily built by initRulesWithWord(): rulesWithWord[w] lists the taggings recorded for word index w.
    protected transient List<IntTaggedWord>[] rulesWithWord;
    // Tag-only entries (word == -1) collected by train() and addTagging().
    protected transient Set<IntTaggedWord> tags = new HashSet<IntTaggedWord>();
    // Word-only entries (tag == -1) collected by train() and addTagging().
    protected transient Set<IntTaggedWord> words = new HashSet<IntTaggedWord>();
    // Counts for (word,tag), (word,-1), (-1,tag) and (-1,-1) over all training tokens.
    protected Counter<IntTaggedWord> seenCounter = new Counter();
    // Counts keyed by (signature,tag), (signature,-1), (-1,tag) and (-1,-1) for tokens train() treats as rare.
    protected Counter<IntTaggedWord> unSeenCounter = new Counter();
    // One-entry cache used by getSignatureIndex(): the last result and the (word, position) it was computed for.
    protected transient int lastSignatureIndex = -1;
    protected transient int lastSentencePosition = -1;
    protected transient int lastWordToSignaturize = -1;
    // Smoothing weights: smooth[0] is applied to unseen words, smooth[1] to seen words (see score() and tune()).
    double[] smooth = new double[]{1.0, 1.0};
    // Tables filled by buildPT_T() and read by score() when smartMutation is on.
    transient double[][] m_TT = null;
    transient double[] m_T = null;
    // Valid range for unknownLevel; the constructor clamps to the same literal values 0 and 8.
    private static final int MIN_UNKNOWN = 0;
    private static final int MAX_UNKNOWN = 8;
    // Debugging state; not referenced by any code visible in this file.
    private transient int debugLastWord = -1;
    private transient int debugLoc = -1;
    private transient StringBuilder debugProbs;
    private transient StringBuilder debugNoProbs;
    private transient String debugPrefix;
    // Number of histogram bins; printLexStats() uses the same literal value 15.
    private static final int STATS_BINS = 15;
    private static final long serialVersionUID = 40L;

    /**
     * Creates a lexicon configured from a freshly constructed {@code Options.LexOptions}.
     */
    public BaseLexicon() {
        this(new Options.LexOptions());
    }

    public BaseLexicon(Options.LexOptions op) {
        this.unknownLevel = op.useUnknownWordSignatures;
        if (this.unknownLevel < 0 || this.unknownLevel > 8) {
            if (this.unknownLevel < 0) {
                this.unknownLevel = 0;
            } else if (this.unknownLevel > 8) {
                this.unknownLevel = 8;
            }
            System.err.println("Invalid value for useUnknownWordSignatures");
        }
        this.smoothInUnknownsThreshold = op.smoothInUnknownsThreshold;
        this.smartMutation = op.smartMutation;
    }

    /**
     * Reports whether at least one tagging is recorded for the given word index.
     *
     * <p>Builds {@code rulesWithWord} on first use. Indices outside the table, including negative
     * ones such as the -1 "no word" sentinel, are reported as unknown instead of raising
     * {@link ArrayIndexOutOfBoundsException}.
     *
     * @param word word index in the "words" numberer
     * @return {@code true} if the index is inside the table and has at least one tagging
     */
    @Override
    public boolean isKnown(int word) {
        if (this.rulesWithWord == null) {
            this.initRulesWithWord();
        }
        return word >= 0
                && word < this.rulesWithWord.length
                && !this.rulesWithWord[word].isEmpty();
    }

    /**
     * Reports whether the given word string has a positive count in {@code seenCounter}.
     *
     * <p>The string is looked up through {@code Numberer.number("words", ...)} and probed as a
     * word-only entry (tag slot -1).
     *
     * @param word surface form to look up
     * @return {@code true} if the word-only count is greater than zero
     */
    @Override
    public boolean isKnown(String word) {
        int wordIndex = Numberer.number("words", word);
        IntTaggedWord wordOnly = new IntTaggedWord(wordIndex, -1);
        double timesSeen = this.seenCounter.getCount(wordOnly);
        return timesSeen > 0.0;
    }

    /**
     * Returns the candidate taggings for a word.
     *
     * <p>For a known word these are the entries stored in {@code rulesWithWord}. For any other
     * word, one tagging is produced per tag listed under the "UNK" word, each paired with the
     * requested word index.
     *
     * @param word word index in the "words" numberer
     * @param loc  position in the sentence; not used by this implementation
     * @return iterator over the candidate taggings
     */
    @Override
    public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc) {
        if (!this.isKnown(word)) {
            int unkIndex = Numberer.number("words", "UNK");
            List<IntTaggedWord> unkTaggings = this.rulesWithWord[unkIndex];
            List<IntTaggedWord> guessed = new ArrayList<IntTaggedWord>(40);
            for (Iterator<IntTaggedWord> it = unkTaggings.iterator(); it.hasNext(); ) {
                guessed.add(new IntTaggedWord(word, it.next().tag));
            }
            return guessed.iterator();
        }
        return this.rulesWithWord[word].iterator();
    }

    /**
     * Builds {@code rulesWithWord}, the per-word list of taggings, from {@code seenCounter}.
     *
     * <p>Every (word, tag) key of {@code seenCounter} with both slots set is filed under its word.
     * Tag-only keys whose count in {@code unSeenCounter} exceeds
     * {@code Train.openClassTypesThreshold} are additionally filed under the "UNK" word.
     * When {@code Test.verbose} is set, the tags filed under "UNK" are printed to standard error.
     */
    protected void initRulesWithWord() {
        int unkIndex = Numberer.number("words", "UNK");
        int wordCount = Numberer.getGlobalNumberer("words").total();
        this.rulesWithWord = new List[wordCount];
        for (int idx = 0; idx < wordCount; ++idx) {
            this.rulesWithWord[idx] = new ArrayList<IntTaggedWord>(1);
        }

        // First pass: file complete (word, tag) entries, and set tag-only entries aside.
        Set<IntTaggedWord> tagOnlyEntries = new HashSet<IntTaggedWord>();
        for (IntTaggedWord entry : this.seenCounter.keySet()) {
            if (entry.tag() == -1) {
                continue;
            }
            if (entry.word() != -1) {
                this.rulesWithWord[entry.word].add(entry);
            } else {
                tagOnlyEntries.add(entry);
            }
        }

        // Second pass: tags seen often enough on rare words become possible taggings of "UNK".
        for (IntTaggedWord tagOnly : tagOnlyEntries) {
            double unseenCount = this.unSeenCounter.getCount(tagOnly);
            if (unseenCount > (double)Train.openClassTypesThreshold) {
                this.rulesWithWord[unkIndex].add(new IntTaggedWord(unkIndex, tagOnly.tag));
            }
        }

        if (Test.verbose) {
            System.err.print("Open class tags are: [");
            for (IntTaggedWord openTag : this.rulesWithWord[unkIndex]) {
                System.err.print(" " + Numberer.object("tags", openTag.tag()));
            }
            System.err.print("] ");
        }
    }

    /**
     * Converts the tagged yield of a tree into indexed (word, tag) events.
     *
     * @param tree tree whose {@code taggedYield()} is converted
     * @return the result of {@link #listToEvents(List)} applied to the tree's tagged yield
     */
    protected List<IntTaggedWord> treeToEvents(Tree tree) {
        Sentence taggedWords = tree.taggedYield();
        return this.listToEvents(taggedWords);
    }

    /**
     * Converts tagged words into indexed (word, tag) events.
     *
     * <p>Word strings are looked up in the "words" numberer and tag strings in the "tags"
     * numberer, in that order for each token.
     *
     * @param taggedWords tokens to convert
     * @return a new list with one {@code IntTaggedWord} per input token, in input order
     */
    protected List<IntTaggedWord> listToEvents(List<TaggedWord> taggedWords) {
        List<IntTaggedWord> events = new ArrayList<IntTaggedWord>();
        for (Iterator<TaggedWord> it = taggedWords.iterator(); it.hasNext(); ) {
            TaggedWord token = it.next();
            int wordIndex = Numberer.number("words", token.word());
            int tagIndex = Numberer.number("tags", token.tag());
            events.add(new IntTaggedWord(wordIndex, tagIndex));
        }
        return events;
    }

    /**
     * Calls {@link #addAll(List, double)} with a weight of 1.0.
     *
     * @param tagWords tagged words to pass on
     */
    public void addAll(List<TaggedWord> tagWords) {
        this.addAll(tagWords, 1.0);
    }

    /**
     * Converts the tagged words to indexed events and then discards them.
     *
     * <p>NOTE(review): no counter is updated and {@code weight} is never read, so the only
     * visible work is the conversion done by {@link #listToEvents(List)}. This looks like an
     * unfinished stub; confirm the intended behavior before relying on it.
     *
     * @param taggedWords tagged words to convert
     * @param weight      unused
     */
    public void addAll(List<TaggedWord> taggedWords, double weight) {
        List<IntTaggedWord> tagWords = this.listToEvents(taggedWords);
    }

    /**
     * Does nothing in this class; the body is empty.
     *
     * @param taggedWords ignored
     */
    public void trainWithExpansion(Collection<TaggedWord> taggedWords) {
    }

    /**
     * Trains on the given trees with a weight of 1.0 per token.
     *
     * @param trees training trees
     */
    @Override
    public void train(Collection<Tree> trees) {
        this.train(trees, 1.0);
    }

    /**
     * Accumulates lexicon counts from the tagged yield of every tree, then calls {@link #tune}.
     *
     * <p>For each token, {@code seenCounter} is incremented for the (word, tag) pair, the tag
     * alone, the word alone and the grand total. Once more than
     * {@code Train.fractionBeforeUnseenCounting} of the trees have been visited, a token whose
     * word count is still below 2 also increments {@code unSeenCounter} under its unknown-word
     * signature.
     *
     * @param trees  training trees
     * @param weight amount added to each counter per token
     */
    public void train(Collection<Tree> trees, double weight) {
        int treeCount = trees.size();
        int treesVisited = 0;
        for (Tree tree : trees) {
            ++treesVisited;
            List<IntTaggedWord> events = this.treeToEvents(tree);
            for (int pos = 0; pos < events.size(); ++pos) {
                IntTaggedWord wordTag = events.get(pos);
                IntTaggedWord tagOnly = new IntTaggedWord(-1, wordTag.tag);
                IntTaggedWord wordOnly = new IntTaggedWord(wordTag.word, -1);
                IntTaggedWord grandTotal = new IntTaggedWord(-1, -1);

                this.seenCounter.incrementCount(wordTag, weight);
                this.seenCounter.incrementCount(tagOnly, weight);
                this.seenCounter.incrementCount(wordOnly, weight);
                this.seenCounter.incrementCount(grandTotal, weight);
                this.tags.add(tagOnly);
                this.words.add(wordOnly);

                boolean pastInitialFraction =
                        treesVisited > (int)((double)treeCount * Train.fractionBeforeUnseenCounting);
                if (pastInitialFraction && this.seenCounter.getCount(wordOnly) < 2.0) {
                    int signature = this.getSignatureIndex(wordTag.word, pos);
                    // Result intentionally unused: the call only makes sure "UNK" is numbered.
                    Numberer.number("words", "UNK");
                    IntTaggedWord signatureTag = new IntTaggedWord(signature, wordTag.tag);
                    IntTaggedWord signatureOnly = new IntTaggedWord(signature, -1);
                    this.unSeenCounter.incrementCount(signatureTag, weight);
                    this.unSeenCounter.incrementCount(tagOnly, weight);
                    this.unSeenCounter.incrementCount(signatureOnly, weight);
                    this.unSeenCounter.incrementCount(grandTotal, weight);
                }
            }
        }
        this.tune(trees);
    }

    /**
     * Adds {@code count} to one lexicon entry.
     *
     * <p>Unseen entries only update {@code unSeenCounter}. Seen entries update
     * {@code seenCounter}; a word-only entry (tag -1) is also added to {@code words}, otherwise
     * a tag-only entry (word -1) is also added to {@code tags}.
     *
     * @param seen  {@code true} to update the seen counter, {@code false} for the unseen counter
     * @param itw   entry to update
     * @param count amount to add
     */
    protected void addTagging(boolean seen, IntTaggedWord itw, double count) {
        if (!seen) {
            this.unSeenCounter.incrementCount(itw, count);
            return;
        }
        this.seenCounter.incrementCount(itw, count);
        boolean wordOnly = itw.tag() == -1;
        if (wordOnly) {
            this.words.add(itw);
            return;
        }
        if (itw.word() == -1) {
            this.tags.add(itw);
        }
    }

    /**
     * Returns the "words" numberer index of the unknown-word signature for a word at a position.
     *
     * <p>The most recent result is cached; repeating the previous (word, position) query returns
     * the cached index without recomputing the signature.
     *
     * @param wordIndex        word index in the "words" numberer
     * @param sentencePosition position of the word in its sentence
     * @return index of the signature string in the "words" numberer
     */
    protected int getSignatureIndex(int wordIndex, int sentencePosition) {
        boolean sameAsLastQuery = wordIndex == this.lastWordToSignaturize
                && sentencePosition == this.lastSentencePosition;
        if (!sameAsLastQuery) {
            String wordString = (String)Numberer.object("words", wordIndex);
            String signature = this.getSignature(wordString, sentencePosition);
            // The cache fields are only touched after the signature was computed successfully.
            this.lastSignatureIndex = Numberer.number("words", signature);
            this.lastSentencePosition = sentencePosition;
            this.lastWordToSignaturize = wordIndex;
        }
        return this.lastSignatureIndex;
    }

    /**
     * Builds the unknown-word signature string for a word.
     *
     * <p>The signature always starts with {@code "UNK"}; {@code unknownLevel} selects which
     * features are appended. Levels without a case below (such as 0) yield plain {@code "UNK"}.
     * An empty word no longer raises {@link StringIndexOutOfBoundsException}: features that need
     * a first or last character are simply omitted for it.
     *
     * @param word surface form of the word
     * @param loc  position of the word in its sentence (0 means sentence-initial)
     * @return the signature string
     */
    protected String getSignature(String word, int loc) {
        StringBuilder sb = new StringBuilder("UNK");
        switch (this.unknownLevel) {
            case 8:
                appendLevel8Features(word, sb);
                // No break here: level 8 has always continued into the level-7 step, and the
                // resulting strings are what getSignatureIndex() numbers and unSeenCounter is
                // keyed by. NOTE(review): confirm whether the fall-through was intended.
            case 7:
                appendNumOrLastChar(word, sb);
                break;
            case 6:
                if (word.startsWith("Al")) {
                    sb.append("-Al");
                }
                appendNumOrLastChar(word, sb);
                break;
            case 5:
                this.appendLevel5Features(word, loc, sb);
                break;
            case 4:
                appendLevel4Features(word, loc, sb);
                break;
            case 3:
                appendLevel3Features(word, loc, sb);
                break;
            case 2:
                appendLevel2Features(word, loc, sb);
                break;
            case 1:
                appendLevel1Features(word, loc, sb);
                break;
            default:
                break;
        }
        return sb.toString();
    }

    /** Level 8: "Al" prefix, number marker or first character, then the ArabicUnknownWordSignatures suffix features. */
    private static void appendLevel8Features(String word, StringBuilder sb) {
        if (word.startsWith("Al")) {
            sb.append("-Al");
        }
        if (ArabicUnknownWordSignatures.allDigitPlus(word)) {
            sb.append("-NUM");
        } else if (word.length() > 0) {
            sb.append('-').append(word.charAt(0));
        }
        sb.append(ArabicUnknownWordSignatures.likelyAdjectivalSuffix(word));
        sb.append(ArabicUnknownWordSignatures.pastTenseVerbNumberSuffix(word));
        sb.append(ArabicUnknownWordSignatures.presentTenseVerbNumberSuffix(word));
        sb.append(ArabicUnknownWordSignatures.taaMarbuuTaSuffix(word));
        sb.append(ArabicUnknownWordSignatures.abstractionNounSuffix(word));
    }

    /** Levels 6, 7 and 8: "-NUM" when allDigitPlus accepts the word, otherwise its last character with no separator. */
    private static void appendNumOrLastChar(String word, StringBuilder sb) {
        if (ArabicUnknownWordSignatures.allDigitPlus(word)) {
            sb.append("-NUM");
        } else if (word.length() > 0) {
            sb.append(word.charAt(word.length() - 1));
        }
    }

    /** Level 5: capitalization class, digit and dash markers, and a common English suffix. */
    private void appendLevel5Features(String word, int loc, StringBuilder sb) {
        int wlen = word.length();
        if (wlen == 0) {
            return;
        }
        int numCaps = 0;
        boolean hasDigit = false;
        boolean hasDash = false;
        boolean hasLower = false;
        for (int i = 0; i < wlen; ++i) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
                hasDigit = true;
            } else if (ch == '-') {
                hasDash = true;
            } else if (Character.isLetter(ch)) {
                if (Character.isLowerCase(ch)) {
                    hasLower = true;
                } else if (Character.isTitleCase(ch)) {
                    // Title-case letters count as both a lowercase and a capital letter.
                    hasLower = true;
                    ++numCaps;
                } else {
                    ++numCaps;
                }
            }
        }
        char ch0 = word.charAt(0);
        // Fixed locale so the signature does not depend on the JVM default locale.
        String lowered = word.toLowerCase(java.util.Locale.ENGLISH);
        if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
            if (loc == 0 && numCaps == 1) {
                sb.append("-INITC");
                if (this.isKnown(lowered)) {
                    sb.append("-KNOWNLC");
                }
            } else {
                sb.append("-CAPS");
            }
        } else if (!Character.isLetter(ch0) && numCaps > 0) {
            sb.append("-CAPS");
        } else if (hasLower) {
            sb.append("-LC");
        }
        if (hasDigit) {
            sb.append("-NUM");
        }
        if (hasDash) {
            sb.append("-DASH");
        }
        if (lowered.endsWith("s") && wlen >= 3) {
            // A final "s" preceded by 's', 'i' or 'u' is not marked.
            char ch2 = lowered.charAt(wlen - 2);
            if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
                sb.append("-s");
            }
            return;
        }
        if (wlen < 5 || hasDash || (hasDigit && numCaps > 0)) {
            return;
        }
        // First matching suffix wins; the order is significant.
        String[] suffixes = {"ed", "ing", "ion", "er", "est", "ly", "ity", "y", "al"};
        for (String suffix : suffixes) {
            if (lowered.endsWith(suffix)) {
                sb.append('-').append(suffix);
                return;
            }
        }
    }

    /** Level 4: case class, digit/dash/period/comma markers, and the lowercased last letter of longer words. */
    private static void appendLevel4Features(String word, int loc, StringBuilder sb) {
        int wlen = word.length();
        if (wlen == 0) {
            return;
        }
        boolean hasDigit = false;
        boolean hasNonDigit = false;
        boolean hasLetter = false;
        boolean hasLower = false;
        boolean hasDash = false;
        boolean hasPeriod = false;
        boolean hasComma = false;
        for (int i = 0; i < wlen; ++i) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
                hasDigit = true;
                continue;
            }
            hasNonDigit = true;
            if (Character.isLetter(ch)) {
                hasLetter = true;
                if (Character.isLowerCase(ch) || Character.isTitleCase(ch)) {
                    hasLower = true;
                }
            } else if (ch == '-') {
                hasDash = true;
            } else if (ch == '.') {
                hasPeriod = true;
            } else if (ch == ',') {
                hasComma = true;
            }
        }
        char first = word.charAt(0);
        if (Character.isUpperCase(first) || Character.isTitleCase(first)) {
            if (!hasLower) {
                sb.append("-AC");
            } else if (loc == 0) {
                sb.append("-SC");
            } else {
                sb.append("-C");
            }
        } else if (hasLower) {
            sb.append("-L");
        } else if (hasLetter) {
            sb.append("-U");
        } else {
            sb.append("-S");
        }
        if (hasDigit && !hasNonDigit) {
            sb.append("-N");
        } else if (hasDigit) {
            sb.append("-n");
        }
        if (hasDash) {
            sb.append("-H");
        }
        if (hasPeriod) {
            sb.append("-P");
        }
        if (hasComma) {
            sb.append("-C");
        }
        if (wlen > 3) {
            char last = word.charAt(wlen - 1);
            if (Character.isLetter(last)) {
                sb.append("-");
                sb.append(Character.toLowerCase(last));
            }
        }
    }

    /** Level 3: run-length-style character-class pattern, plus the lowercased last character of longer words. */
    private static void appendLevel3Features(String word, int loc, StringBuilder sb) {
        sb.append("-");
        char lastClass = '-';
        int runLength = 0;
        for (int i = 0; i < word.length(); ++i) {
            char ch = word.charAt(i);
            char newClass;
            if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {
                newClass = loc == 0 ? 'S' : 'L';
            } else if (Character.isLetter(ch)) {
                newClass = 'l';
            } else if (Character.isDigit(ch)) {
                newClass = 'd';
            } else if (ch == '-') {
                newClass = 'h';
            } else if (ch == '.') {
                newClass = 'p';
            } else {
                newClass = 's';
            }
            if (newClass != lastClass) {
                lastClass = newClass;
                sb.append(lastClass);
                runLength = 1;
            } else {
                // A repeated class is marked with a single '+', however long the run gets.
                if (runLength < 2) {
                    sb.append('+');
                }
                ++runLength;
            }
        }
        if (word.length() > 3) {
            sb.append('-');
            sb.append(Character.toLowerCase(word.charAt(word.length() - 1)));
        }
    }

    /** Level 2: case class, dash marker, then a digit marker or the lowercased last character of longer words. */
    private static void appendLevel2Features(String word, int loc, StringBuilder sb) {
        boolean hasDigit = false;
        boolean hasNonDigit = false;
        boolean hasLower = false;
        int wlen = word.length();
        for (int i = 0; i < wlen; ++i) {
            char ch = word.charAt(i);
            if (Character.isDigit(ch)) {
                hasDigit = true;
                continue;
            }
            hasNonDigit = true;
            if (Character.isLetter(ch) && (Character.isLowerCase(ch) || Character.isTitleCase(ch))) {
                hasLower = true;
            }
        }
        if (wlen > 0 && (Character.isUpperCase(word.charAt(0)) || Character.isTitleCase(word.charAt(0)))) {
            if (!hasLower) {
                sb.append("-ALLC");
            } else if (loc == 0) {
                sb.append("-INIT");
            } else {
                sb.append("-UC");
            }
        } else if (hasLower) {
            sb.append("-LC");
        }
        if (word.indexOf('-') >= 0) {
            sb.append("-DASH");
        }
        if (hasDigit) {
            sb.append(hasNonDigit ? "-DIG" : "-NUM");
            return;
        }
        if (wlen > 3) {
            // Appended without a separator.
            sb.append(Character.toLowerCase(word.charAt(wlen - 1)));
        }
    }

    /** Level 1: the last two characters followed by a coarse class of the first character. */
    private static void appendLevel1Features(String word, int loc, StringBuilder sb) {
        if (word.length() == 0) {
            return;
        }
        sb.append("-");
        sb.append(word.substring(Math.max(word.length() - 2, 0), word.length()));
        sb.append("-");
        char first = word.charAt(0);
        if (Character.isLowerCase(first)) {
            sb.append("LOWER");
        } else if (Character.isUpperCase(first)) {
            sb.append(loc == 0 ? "INIT" : "UPPER");
        } else {
            sb.append("OTHER");
        }
    }

    /**
     * Fills the {@code m_TT} and {@code m_T} tables that {@link #score} reads when
     * {@code smartMutation} is enabled.
     *
     * <p>Only words whose total count over all tags is at least 10 contribute. For such a word,
     * its relative frequency with tag {@code t} is added to {@code m_T[t]} and to
     * {@code m_TT[t2][t]} once for every tag {@code t2} the word was seen with.
     */
    void buildPT_T() {
        int numTags = Numberer.getGlobalNumberer("tags").total();
        this.m_TT = new double[numTags][numTags];
        this.m_T = new double[numTags];
        double[] tagCounts = new double[numTags];
        for (IntTaggedWord wordEntry : this.words) {
            // A single probe object is reused; only its tag slot changes between lookups.
            IntTaggedWord probe = new IntTaggedWord(wordEntry.word, -1);
            double wordTotal = 0.0;
            for (int tag = 0; tag < numTags; ++tag) {
                probe.tag = (short)tag;
                tagCounts[tag] = this.seenCounter.getCount(probe);
                wordTotal += tagCounts[tag];
            }
            if (wordTotal < 10.0) {
                continue;
            }
            for (int tag = 0; tag < numTags; ++tag) {
                double share = tagCounts[tag] / wordTotal;
                for (int other = 0; other < numTags; ++other) {
                    if (tagCounts[other] > 0.0) {
                        this.m_T[tag] += share;
                        this.m_TT[other][tag] += share;
                    }
                }
            }
        }
    }

    /**
     * Computes the log score of a word given a tag from the stored counts.
     *
     * <p>The argument is used as a scratch key: its {@code word} and {@code tag} fields are
     * temporarily overwritten for counter lookups and restored before the method returns.
     *
     * @param iTW the (word, tag) pair to score
     * @param loc position of the word in its sentence, used for the unknown-word signature
     * @return the log score, or {@code Float.NEGATIVE_INFINITY} when it is not greater than -100
     *         (which includes NaN results)
     */
    @Override
    public float score(IntTaggedWord iTW, int loc) {
        double pb_W_T;
        boolean seen;
        int word = iTW.word;
        short tag = iTW.tag;
        // Look up the word-only count by temporarily blanking the tag.
        iTW.tag = (short)-1;
        double c_W = this.seenCounter.getCount(iTW);
        iTW.tag = tag;
        boolean bl = seen = c_W > 0.0;
        if (seen) {
            double pb_T_W;
            double c_TW = this.seenCounter.getCount(iTW);
            // (-1, tag): tag totals in both counters.
            iTW.word = -1;
            double c_T = this.seenCounter.getCount(iTW);
            double c_Tunseen = this.unSeenCounter.getCount(iTW);
            // (-1, -1): grand totals in both counters.
            iTW.tag = (short)-1;
            double total = this.seenCounter.getCount(iTW);
            double totalUnseen = this.unSeenCounter.getCount(iTW);
            // Restore the caller's key.
            iTW.tag = tag;
            iTW.word = word;
            // Share of this tag among rare-word tokens; NaN if unSeenCounter has no total.
            double p_T_U = c_Tunseen / totalUnseen;
            if (c_W > (double)this.smoothInUnknownsThreshold) {
                // Frequent word: plain relative frequency.
                pb_T_W = c_TW / c_W;
            } else {
                if (this.smartMutation) {
                    Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
                    int numTags = tagNumberer.total();
                    // (Re)build the tables when missing or when the number of tags has changed.
                    if (this.m_TT == null || numTags != this.m_T.length) {
                        this.buildPT_T();
                    }
                    p_T_U *= 0.1;
                    for (int t = 0; t < numTags; ++t) {
                        IntTaggedWord iTW2 = new IntTaggedWord(word, t);
                        double p_T_W2 = this.seenCounter.getCount(iTW2) / c_W;
                        if (!(p_T_W2 > 0.0)) continue;
                        p_T_U += p_T_W2 * this.m_TT[tag][t] / this.m_T[t] * 0.9;
                    }
                }
                // Rare word: interpolate with p_T_U using the seen-word weight smooth[1].
                pb_T_W = (c_TW + this.smooth[1] * p_T_U) / (c_W + this.smooth[1]);
            }
            double p_T = c_T / total;
            double p_W = c_W / total;
            pb_W_T = Math.log(pb_T_W * p_W / p_T);
        } else {
            int sig;
            // Unseen word: score through its unknown-word signature instead.
            iTW.word = sig = this.getSignatureIndex(iTW.word, loc);
            double c_TS = this.unSeenCounter.getCount(iTW);
            iTW.tag = (short)-1;
            double c_S = this.unSeenCounter.getCount(iTW);
            iTW.word = -1;
            double c_U = this.unSeenCounter.getCount(iTW);
            double total = this.seenCounter.getCount(iTW);
            iTW.tag = tag;
            double c_T = this.unSeenCounter.getCount(iTW);
            double c_Tseen = this.seenCounter.getCount(iTW);
            // Restore the caller's key.
            iTW.word = word;
            double p_T_U = c_T / c_U;
            // Level 0 ignores signature counts, leaving only the p_T_U term.
            if (this.unknownLevel == 0) {
                c_TS = 0.0;
                c_S = 0.0;
            }
            // Interpolate with p_T_U using the unseen-word weight smooth[0].
            double pb_T_S = (c_TS + this.smooth[0] * p_T_U) / (c_S + this.smooth[0]);
            double p_T = c_Tseen / total;
            // An unseen word is given a count of one.
            double p_W = 1.0 / total;
            pb_W_T = Math.log(pb_T_S * p_W / p_T);
        }
        // NaN fails this comparison too, so it is reported as negative infinity.
        if (pb_W_T > -100.0) {
            return (float)pb_W_T;
        }
        return Float.NEGATIVE_INFINITY;
    }

    /**
     * Selects the smoothing weights in {@code smooth}.
     *
     * <p>The search grid currently holds a single point ({@code smooth[0] = 1.0},
     * {@code smooth[1] = 0.2}) and every point scores 0.0, so that point is always selected.
     * The result is then overridden with (8.0, 0.1) when {@code smartMutation} is set, and
     * {@code smooth[0]} is replaced by {@code Test.unseenSmooth} when that value is positive.
     * Progress is printed to standard output when {@code Test.verbose} is set.
     *
     * @param trees training trees; only their number is used, in the verbose message
     */
    public void tune(Collection<Tree> trees) {
        double bestScore = Double.NEGATIVE_INFINITY;
        double[] bestSmooth = new double[]{0.0, 0.0};
        for (this.smooth[0] = 1.0; this.smooth[0] <= 1.0; this.smooth[0] *= 2.0) {
            for (this.smooth[1] = 0.2; this.smooth[1] <= 0.2; this.smooth[1] *= 2.0) {
                double score = 0.0;
                if (Test.verbose) {
                    System.out.println("Tuning lexicon: s0 " + this.smooth[0] + " s1 " + this.smooth[1] + " is " + score + " " + trees.size() + " trees.");
                }
                if (score > bestScore) {
                    System.arraycopy(this.smooth, 0, bestSmooth, 0, this.smooth.length);
                    bestScore = score;
                }
            }
        }
        System.arraycopy(bestSmooth, 0, this.smooth, 0, bestSmooth.length);

        if (this.smartMutation) {
            this.smooth[0] = 8.0;
            this.smooth[1] = 0.1;
        }
        if (Test.unseenSmooth > 0.0) {
            this.smooth[0] = Test.unseenSmooth;
        }
        if (Test.verbose) {
            System.out.println("Tuning selected smoothUnseen " + this.smooth[0] + " smoothSeen " + this.smooth[1] + " at " + bestScore);
        }
    }

    /**
     * Reads lexicon entries and smoothing weights in the text format produced by
     * {@link #writeData(Writer)}.
     *
     * <p>Reading stops at end of input or at the first empty line. A line of the form
     * {@code smooth[i] = value} sets {@code smooth[i]}. Any other line is split on spaces
     * (with quote and backslash handling); field 3 says whether the entry is {@code SEEN},
     * and fields 2, 0 and 4 are passed to {@code IntTaggedWord} and {@link #addTagging}.
     *
     * @param in reader positioned at the start of the lexicon data
     * @throws IOException if reading fails or a line cannot be parsed; for a parse failure the
     *                     message names the line and the underlying exception is kept as the cause
     */
    @Override
    public void readData(BufferedReader in) throws IOException {
        final String SEEN = "SEEN";
        Pattern smoothPattern = Pattern.compile("^smooth\\[([0-9])\\] = (.*)$");
        int lineNum = 1;
        String line = in.readLine();
        while (line != null && line.length() > 0) {
            try {
                Matcher m = smoothPattern.matcher(line);
                if (m.matches()) {
                    int i = Integer.parseInt(m.group(1));
                    this.smooth[i] = Double.parseDouble(m.group(2));
                } else {
                    String[] fields = StringUtils.splitOnCharWithQuoting(line, ' ', '\"', '\\');
                    boolean seen = fields[3].equals(SEEN);
                    this.addTagging(seen, new IntTaggedWord(fields[2], fields[0]), Double.parseDouble(fields[4]));
                }
            }
            catch (RuntimeException e) {
                // Keep the original failure (bad number, missing field, bad smooth index) as the cause.
                IOException wrapped = new IOException("Error on line " + lineNum + ": " + line);
                wrapped.initCause(e);
                throw wrapped;
            }
            ++lineNum;
            line = in.readLine();
        }
    }

    /**
     * Writes all seen and unseen entries followed by the smoothing weights, one item per line.
     *
     * <p>The writer is flushed but not closed. Because {@link PrintWriter} records I/O failures
     * instead of throwing them, its error flag is checked at the end and turned into an
     * {@link IOException}, so a truncated lexicon is not silently reported as a success.
     *
     * @param w destination writer
     * @throws IOException if the underlying writer reported an error while writing or flushing
     */
    @Override
    public void writeData(Writer w) throws IOException {
        PrintWriter out2 = new PrintWriter(w);
        for (IntTaggedWord itw : this.seenCounter.keySet()) {
            out2.println(itw.toLexicalEntry() + " SEEN " + this.seenCounter.getCount(itw));
        }
        for (IntTaggedWord itw : this.unSeenCounter.keySet()) {
            out2.println(itw.toLexicalEntry() + " UNSEEN " + this.unSeenCounter.getCount(itw));
        }
        for (int i = 0; i < this.smooth.length; ++i) {
            out2.println("smooth[" + i + "] = " + this.smooth[i]);
        }
        // checkError() flushes the stream and reports any failure PrintWriter swallowed.
        if (out2.checkError()) {
            throw new IOException("Error writing lexicon data");
        }
    }

    /**
     * Prints summary statistics about the lexicon to standard output.
     *
     * <p>Output covers the unknown-word level, table sizes, a histogram of how many words have
     * how many taggings (the last bin collects everything at or above {@code STATS_BINS - 1}),
     * the words themselves for bins 7 and up, and the unseen counter.
     */
    public void printLexStats() {
        if (this.rulesWithWord == null) {
            this.initRulesWithWord();
        }
        System.out.println("BaseLexicon statistics");
        System.out.println("unknownLevel is " + this.unknownLevel);

        int totalTaggings = 0;
        for (int w = 0; w < this.rulesWithWord.length; ++w) {
            totalTaggings += this.rulesWithWord[w].size();
        }
        System.out.println("Sum of rulesWithWord: " + totalTaggings);
        System.out.println("Tags size: " + this.tags.size());
        System.out.println("Words size: " + this.words.size());
        System.out.println("rulesWithWord length: " + this.rulesWithWord.length + " [should be sum of words + sigs]");

        int[] wordsPerBin = new int[STATS_BINS];
        ArrayList[] examplesPerBin = new ArrayList[STATS_BINS];
        for (int bin = 0; bin < STATS_BINS; ++bin) {
            examplesPerBin[bin] = new ArrayList();
        }
        for (int w = 0; w < this.rulesWithWord.length; ++w) {
            int bin = Math.min(this.rulesWithWord[w].size(), STATS_BINS - 1);
            ++wordsPerBin[bin];
            // Only highly ambiguous words are listed individually.
            if (bin >= 7) {
                examplesPerBin[bin].add(Numberer.object("words", w));
            }
        }

        System.out.println("Stats on how many taggings for how many words");
        for (int bin = 0; bin < STATS_BINS; ++bin) {
            System.out.print(bin + " taggings: " + wordsPerBin[bin] + " words ");
            if (bin >= 7) {
                System.out.print(examplesPerBin[bin]);
            }
            System.out.println();
        }

        NumberFormat wholeNumbers = NumberFormat.getNumberInstance();
        wholeNumbers.setMaximumFractionDigits(0);
        System.out.println("Unseen counter: " + this.unSeenCounter.toString(wholeNumbers));
    }

    /**
     * Measures how much of the given trees' vocabulary the lexicon has seen.
     *
     * <p>Words and tags absent from {@code words} / {@code tags} are reported by string, looked
     * up in the same "words" and "tags" numberers the rest of this class uses. (word, tag) pairs
     * with a zero count in {@code seenCounter} are reported as {@code IntTaggedWord}s.
     *
     * @param trees        trees to check
     * @param missingWords receives the strings of words not in the lexicon
     * @param missingTags  receives the strings of tags not in the lexicon
     * @param missingTW    receives the (word, tag) pairs never seen together
     * @return fraction of tokens whose (word, tag) pair was never seen; NaN when the trees
     *         contain no tokens
     */
    public double evaluateCoverage(Collection<Tree> trees, Set missingWords, Set missingTags, Set<IntTaggedWord> missingTW) {
        ArrayList<IntTaggedWord> iTW1 = new ArrayList<IntTaggedWord>();
        for (Tree t : trees) {
            iTW1.addAll(this.treeToEvents(t));
        }
        int total = 0;
        int unseen = 0;
        for (IntTaggedWord itw : iTW1) {
            ++total;
            if (!this.words.contains(new IntTaggedWord(itw.word(), -1))) {
                missingWords.add(Numberer.object("words", itw.word()));
            }
            if (!this.tags.contains(new IntTaggedWord(-1, itw.tag()))) {
                missingTags.add(Numberer.object("tags", itw.tag()));
            }
            if (this.seenCounter.getCount(itw) != 0.0) continue;
            ++unseen;
            missingTW.add(itw);
        }
        return (double)unseen / (double)total;
    }

    /**
     * Command-line driver: trains a lexicon on a treebank and reports on the given words.
     *
     * <p>Arguments: treebank path, file number ranges, unknown-word level, then any number of
     * words. A known word is reported as such; for any other word the score under every tag is
     * printed, followed by the tags whose score is negative infinity.
     *
     * @param args command-line arguments as described above
     */
    public static void main(String[] args) {
        if (args.length < 3) {
            System.err.println("usage: java edu.stanford.nlp.parser.lexparser.BaseLexicon treebankPath fileRanges unknownLevel [word ...]");
            return;
        }
        DiskTreebank tb = new DiskTreebank();
        tb.loadPath(args[0], (FileFilter)new NumberRangesFileFilter(args[1], true));
        BaseLexicon lex = new BaseLexicon();
        lex.unknownLevel = Integer.parseInt(args[2]);
        lex.train(tb);
        Numberer numb = Numberer.getGlobalNumberer("tags");
        NumberFormat nf = NumberFormat.getNumberInstance();
        nf.setMaximumFractionDigits(6);
        List<String> impos = new ArrayList<String>();
        for (int i = 3; i < args.length; ++i) {
            if (lex.isKnown(args[i])) {
                System.out.println(args[i] + " is a known word.");
            } else {
                impos.clear();
                // Copy the tag names into a typed list so they can be sorted and iterated as Strings.
                List<String> sortedTags = new ArrayList<String>();
                for (Object tagName : numb.objects()) {
                    sortedTags.add((String)tagName);
                }
                Collections.sort(sortedTags);
                for (String tStr : sortedTags) {
                    IntTaggedWord iTW = new IntTaggedWord(args[i], tStr);
                    double score = lex.score(iTW, 1);
                    if (score == Double.NEGATIVE_INFINITY) {
                        impos.add(tStr);
                        continue;
                    }
                    System.out.println(StringUtils.pad(args[i] + " / " + tStr, 24) + nf.format(score));
                }
                if (impos.size() > 0) {
                    System.out.println(args[i] + " impossible tags: " + impos);
                }
            }
            System.out.println();
        }
    }
}

