/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.parser.lexparser.ChineseCharacterBasedLexicon;
import edu.stanford.nlp.parser.lexparser.IntTaggedWord;
import edu.stanford.nlp.parser.lexparser.WordSegmenter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Distribution;
import edu.stanford.nlp.stats.GeneralizedCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.trees.international.pennchinese.CharacterLevelTagExtender;
import edu.stanford.nlp.util.Numberer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Word segmenter that splits a character string into tagged words.
 *
 * <p>Candidate words are scored with a {@link ChineseCharacterBasedLexicon}.
 * {@link #basicSegmentWords(String)} combines those scores with the lexicon's
 * own tag distribution, while {@link #segmentWordsWithMarkov(String)} (the
 * default used by {@link #segmentWords(String)}) additionally uses the
 * initial-tag and tag-to-next-tag distributions estimated by
 * {@link #train(Collection)}.
 *
 * <p>Tag indices come from the global {@code "tags"} {@link Numberer}, so the
 * segmenting methods must be called after {@link #train(Collection)} has
 * populated {@code POSes}.
 */
public class ChineseMarkovWordSegmenter
implements WordSegmenter {
    /** Longest candidate word, in characters, that either segmenter will propose. */
    private static final int MAX_WORD_LENGTH = 10;

    /** Smoothed distribution over the tag of the first word of a sentence. */
    Distribution initialPOSDist;
    /** Maps a tag to the smoothed distribution over the tag that follows it. */
    Map markovPOSDists;
    /** Lexicon used to score (word, tag) pairs. */
    ChineseCharacterBasedLexicon lex;
    /** Snapshot of the objects in the global "tags" numberer taken at the end of training. */
    Set POSes;
    private static final long serialVersionUID = 1559606198270645508L;

    /**
     * Creates a segmenter that scores words with the given lexicon.
     *
     * @param lex the lexicon to train and to score candidate words with
     */
    public ChineseMarkovWordSegmenter(ChineseCharacterBasedLexicon lex) {
        this.lex = lex;
    }

    /** Creates a segmenter backed by a freshly constructed lexicon. */
    public ChineseMarkovWordSegmenter() {
        this.lex = new ChineseCharacterBasedLexicon();
    }

    /**
     * Trains the lexicon and estimates the tag distributions from the trees.
     *
     * <p>For every tree the pre-terminal yield is walked once: the first tag
     * is counted as a sentence-initial tag and every following tag is counted
     * as a successor of the tag before it. Both count tables are then turned
     * into distributions via {@code Distribution.laplaceSmoothedDistribution}
     * with the total number of known tags and a parameter of 0.5.
     *
     * @param trees the training trees; every element must be a {@link Tree}
     */
    public void train(Collection trees) {
        Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
        this.lex.train(trees);
        Counter<String> initial = new Counter<String>();
        GeneralizedCounter ruleCounter = new GeneralizedCounter(2);
        for (Object treeObj : trees) {
            Tree tree = (Tree)treeObj;
            List<Label> tags = tree.preTerminalYield();
            String last = null;
            for (Label label : tags) {
                String tag = label.value();
                // Make sure every observed tag has an index in the global numberer.
                tagNumberer.number(tag);
                if (last == null) {
                    initial.incrementCount(tag);
                } else {
                    ruleCounter.incrementCount2D(last, tag);
                }
                last = tag;
            }
        }
        int numTags = tagNumberer.total();
        this.POSes = new HashSet(tagNumberer.objects());
        this.initialPOSDist = Distribution.laplaceSmoothedDistribution(initial, numTags, 0.5);
        this.markovPOSDists = new HashMap();
        Set entries = ruleCounter.lowestLevelCounterEntrySet();
        for (Object entryObj : entries) {
            Map.Entry entry = (Map.Entry)entryObj;
            Distribution d = Distribution.laplaceSmoothedDistribution((Counter)entry.getValue(), numTags, 0.5);
            // The entry key is the list of conditioning keys; its first element is the previous tag.
            this.markovPOSDists.put(((List)entry.getKey()).get(0), d);
        }
    }

    /**
     * Segments the string using {@link #segmentWordsWithMarkov(String)}.
     *
     * @param s the unsegmented character string
     * @return the segmented, tagged words
     */
    public Sentence segmentWords(String s) {
        return this.segmentWordsWithMarkov(s);
    }

    /**
     * Segments the string with a chart over character spans, scoring each
     * candidate word by the lexicon score plus the log probability of its tag
     * under the lexicon's tag distribution. No tag-transition information is used.
     *
     * @param s the unsegmented character string
     * @return the best-scoring segmentation as {@link TaggedWord}s; empty for an empty string
     * @throws IllegalStateException if no segmentation with a usable score exists,
     *         so the back-trace cannot advance
     */
    public Sentence basicSegmentWords(String s) {
        Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
        int length = s.length();
        // scores[start][end] is the best score found for the span [start, end).
        double[][] scores = new double[length][length + 1];
        // splitBacktrace[start][end] is the end of the first word in the best analysis of the span.
        int[][] splitBacktrace = new int[length][length + 1];
        // POSbacktrace[start][end] is the tag index of the best single-word analysis of the span.
        int[][] POSbacktrace = new int[length][length + 1];
        for (double[] row : scores) {
            Arrays.fill(row, Double.NEGATIVE_INFINITY);
        }
        // Pass 1: score every span of up to MAX_WORD_LENGTH characters as a single word.
        for (int diff = 1; diff <= MAX_WORD_LENGTH; ++diff) {
            for (int start = 0; start + diff <= length; ++start) {
                int end = start + diff;
                String word = s.substring(start, end);
                for (Object tagObj : this.POSes) {
                    String tag = (String)tagObj;
                    IntTaggedWord itw = new IntTaggedWord(word, tag);
                    double newScore = (double)this.lex.score(itw, 0) + Math.log(this.lex.getPOSDistribution().probabilityOf(tag));
                    if (newScore > scores[start][end]) {
                        scores[start][end] = newScore;
                        splitBacktrace[start][end] = end;
                        POSbacktrace[start][end] = itw.tag();
                    }
                }
            }
        }
        // Pass 2: combine a single leading word with the best analysis of the rest of the span.
        for (int diff = 2; diff <= length; ++diff) {
            for (int start = 0; start + diff <= length; ++start) {
                int end = start + diff;
                for (int split = start + 1; split < end && split - start <= MAX_WORD_LENGTH; ++split) {
                    // Only extend analyses whose left part is still a single word.
                    if (splitBacktrace[start][split] != split) {
                        continue;
                    }
                    double newScore = scores[start][split] + scores[split][end];
                    if (newScore > scores[start][end]) {
                        scores[start][end] = newScore;
                        splitBacktrace[start][end] = split;
                    }
                }
            }
        }
        // Read the words off the chart from left to right.
        ArrayList<TaggedWord> words = new ArrayList<TaggedWord>();
        int start = 0;
        while (start < length) {
            int end = splitBacktrace[start][length];
            if (end <= start) {
                // The cell was never filled; without this check the loop would never terminate.
                throw new IllegalStateException("No segmentation found for input of length " + length + " (stuck at position " + start + ")");
            }
            String word = s.substring(start, end);
            String tag = (String)tagNumberer.object(POSbacktrace[start][end]);
            words.add(new TaggedWord(word, tag));
            start = end;
        }
        return new Sentence((Collection<? extends HasWord>)words);
    }

    /**
     * Segments the string with a chart indexed by span and by the tag of the
     * span's first word. Single words are scored by the lexicon (plus the log
     * initial-tag probability when the word starts the string); adjacent
     * analyses are joined using the log probability of the right-hand tag
     * given the left-hand tag.
     *
     * @param s the unsegmented character string
     * @return the best-scoring segmentation as {@link TaggedWord}s; empty for an empty string
     * @throws IllegalStateException if no segmentation with a usable score exists,
     *         so the back-trace cannot advance
     */
    public Sentence segmentWordsWithMarkov(String s) {
        Numberer tagNumberer = Numberer.getGlobalNumberer("tags");
        int length = s.length();
        ArrayList<TaggedWord> words = new ArrayList<TaggedWord>();
        if (length == 0) {
            // The chart has no rows for an empty string, so scores[0][length] below would be out of bounds.
            return new Sentence((Collection<? extends HasWord>)words);
        }
        int numTags = this.POSes.size();
        // scores[start][end][t]: best score for the span [start, end) whose first word has tag index t.
        double[][][] scores = new double[length][length + 1][numTags];
        // splitBacktrace[start][end][t]: end of the first word in that best analysis.
        int[][][] splitBacktrace = new int[length][length + 1][numTags];
        // POSbacktrace[start][end][t]: tag index of the word that follows the first word.
        int[][][] POSbacktrace = new int[length][length + 1][numTags];
        for (double[][] plane : scores) {
            for (double[] row : plane) {
                Arrays.fill(row, Double.NEGATIVE_INFINITY);
            }
        }
        // Pass 1: score every span of up to MAX_WORD_LENGTH characters as a single word, for every tag.
        for (int diff = 1; diff <= MAX_WORD_LENGTH; ++diff) {
            for (int start = 0; start + diff <= length; ++start) {
                int end = start + diff;
                String word = s.substring(start, end);
                for (Object tagObj : this.POSes) {
                    String tag = (String)tagObj;
                    IntTaggedWord itw = new IntTaggedWord(word, tag);
                    double score = this.lex.score(itw, 0);
                    if (start == 0) {
                        // Only a word at the very beginning is charged the initial-tag probability.
                        score += Math.log(this.initialPOSDist.probabilityOf(tag));
                    }
                    scores[start][end][itw.tag()] = score;
                    splitBacktrace[start][end][itw.tag()] = end;
                }
            }
        }
        // Pass 2: join a single leading word (tag) with the best analysis of the rest (starting with rTag).
        for (int diff = 2; diff <= length; ++diff) {
            for (int start = 0; start + diff <= length; ++start) {
                int end = start + diff;
                for (int split = start + 1; split < end && split - start <= MAX_WORD_LENGTH; ++split) {
                    for (Object tagObj : this.POSes) {
                        String tag = (String)tagObj;
                        int tagNum = tagNumberer.number(tag);
                        // Only extend analyses whose left part is still a single word.
                        if (splitBacktrace[start][split][tagNum] != split) {
                            continue;
                        }
                        Distribution rTagDist = (Distribution)this.markovPOSDists.get(tag);
                        if (rTagDist == null) {
                            // No successor distribution was stored for this tag during training.
                            continue;
                        }
                        for (Object rTagObj : this.POSes) {
                            String rTag = (String)rTagObj;
                            int rTagNum = tagNumberer.number(rTag);
                            double newScore = scores[start][split][tagNum] + scores[split][end][rTagNum] + Math.log(rTagDist.probabilityOf(rTag));
                            if (newScore > scores[start][end][tagNum]) {
                                scores[start][end][tagNum] = newScore;
                                splitBacktrace[start][end][tagNum] = split;
                                POSbacktrace[start][end][tagNum] = rTagNum;
                            }
                        }
                    }
                }
            }
        }
        // Start from the best tag for the whole string and follow the back-pointers.
        int nextPOS = ArrayMath.argmax(scores[0][length]);
        int start = 0;
        while (start < length) {
            int split = splitBacktrace[start][length][nextPOS];
            if (split <= start) {
                // The cell was never filled; without this check the loop would never terminate.
                throw new IllegalStateException("No segmentation found for input of length " + length + " (stuck at position " + start + ")");
            }
            String word = s.substring(start, split);
            String tag = (String)tagNumberer.object(nextPOS);
            words.add(new TaggedWord(word, tag));
            if (split < length) {
                nextPOS = POSbacktrace[start][length][nextPOS];
            }
            start = split;
        }
        return new Sentence((Collection<? extends HasWord>)words);
    }

    /**
     * Re-segments the character string of every tree in the treebank and
     * returns the distribution of the lengths of the resulting items.
     *
     * <p>NOTE(review): the length is taken from {@code toString()} of each
     * returned item; if that rendering includes the tag as well as the word,
     * the lengths are not pure word lengths -- confirm against the intended use.
     *
     * @param tb the treebank whose yields are concatenated and re-segmented
     * @return the distribution built from the observed length counts
     */
    private Distribution getSegmentedWordLengthDistribution(Treebank tb) {
        Counter<Integer> c = new Counter<Integer>();
        for (Object treeObj : tb) {
            Tree gold = (Tree)treeObj;
            StringBuffer goldChars = new StringBuffer();
            Sentence goldYield = gold.yield();
            for (Object wordObj : goldYield) {
                Word word = (Word)wordObj;
                goldChars.append(word);
            }
            Sentence ourWords = this.segmentWords(goldChars.toString());
            for (int i = 0; i < ourWords.size(); ++i) {
                c.incrementCount(Integer.valueOf(((HasWord)ourWords.get(i)).toString().length()));
            }
        }
        return Distribution.getDistribution(c);
    }
}

