/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.parser.lexparser.ChineseLexicon;
import edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams;
import edu.stanford.nlp.parser.lexparser.CollinsPuncTransformer;
import edu.stanford.nlp.parser.lexparser.IntTaggedWord;
import edu.stanford.nlp.parser.lexparser.LeftHeadFinder;
import edu.stanford.nlp.parser.lexparser.Lexicon;
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.parser.lexparser.ParentAnnotationStats;
import edu.stanford.nlp.parser.lexparser.Test;
import edu.stanford.nlp.parser.lexparser.Train;
import edu.stanford.nlp.parser.lexparser.TreeAnnotator;
import edu.stanford.nlp.parser.lexparser.TreeAnnotatorAndBinarizer;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;
import edu.stanford.nlp.parser.lexparser.WordSegmenter;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.Function;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.process.WordSegmentingTokenizer;
import edu.stanford.nlp.trees.MemoryTreebank;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.Treebank;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;
import edu.stanford.nlp.util.Timing;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InvalidClassException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.StreamCorruptedException;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class ChineseLexiconAndWordSegmenter
implements Lexicon,
WordSegmenter {
    private final ChineseLexicon chineseLexicon;
    private final WordSegmenter wordSegmenter;
    private Options op;

    /**
     * Wraps an existing lexicon and word segmenter in a single object.
     *
     * As a side effect, the static tokenizer factory of
     * {@link ChineseTreebankLanguagePack} is replaced by one built from the
     * given segmenter.
     *
     * @param lex the lexicon that all {@code Lexicon} calls are delegated to
     * @param seg the segmenter that all {@code WordSegmenter} calls are delegated to
     */
    public ChineseLexiconAndWordSegmenter(ChineseLexicon lex, WordSegmenter seg) {
        this.wordSegmenter = seg;
        this.chineseLexicon = lex;
        // Global side effect: tokenization for the Chinese language pack now uses this segmenter.
        ChineseTreebankLanguagePack.setTokenizerFactory(WordSegmentingTokenizer.factory(this.wordSegmenter));
    }

    /**
     * Segments a string into words by delegating to the wrapped word segmenter.
     *
     * @param s the text to segment
     * @return whatever the wrapped segmenter returns for {@code s}
     */
    public Sentence segmentWords(String s) {
        final WordSegmenter delegate = wordSegmenter;
        return delegate.segmentWords(s);
    }

    /**
     * Reports whether the wrapped lexicon knows the word with the given integer id.
     *
     * @param word integer id of the word
     * @return the wrapped lexicon's answer
     */
    public boolean isKnown(int word) {
        final boolean known = chineseLexicon.isKnown(word);
        return known;
    }

    /**
     * Reports whether the wrapped lexicon knows the given word string.
     *
     * @param word the word to look up
     * @return the wrapped lexicon's answer
     */
    public boolean isKnown(String word) {
        final boolean known = chineseLexicon.isKnown(word);
        return known;
    }

    /**
     * Returns the wrapped lexicon's rule iterator for a word at a position.
     *
     * @param word integer id of the word
     * @param loc  position argument passed through unchanged to the lexicon
     * @return the iterator produced by the wrapped lexicon
     */
    public Iterator ruleIteratorByWord(int word, int loc) {
        final Iterator rules = chineseLexicon.ruleIteratorByWord(word, loc);
        return rules;
    }

    /**
     * Trains both wrapped components on the same collection of trees:
     * the lexicon first, then the word segmenter.
     *
     * @param trees the training trees, handed to both components unchanged
     */
    public void train(Collection trees) {
        final ChineseLexicon lexicon = chineseLexicon;
        final WordSegmenter segmenter = wordSegmenter;
        lexicon.train(trees);
        segmenter.train(trees);
    }

    /**
     * Scores a tagged word at a position by delegating to the wrapped lexicon.
     *
     * @param iTW the tagged word to score
     * @param loc position argument passed through unchanged to the lexicon
     * @return the score computed by the wrapped lexicon
     */
    public float score(IntTaggedWord iTW, int loc) {
        final float lexiconScore = chineseLexicon.score(iTW, loc);
        return lexiconScore;
    }

    /**
     * Reads lexicon data from a reader by delegating to the wrapped lexicon.
     * The word segmenter is not touched by this call.
     *
     * @param in source of the lexicon data
     * @throws IOException if the wrapped lexicon fails while reading
     */
    public void readData(BufferedReader in) throws IOException {
        final ChineseLexicon lexicon = chineseLexicon;
        lexicon.readData(in);
    }

    /**
     * Writes lexicon data to a writer by delegating to the wrapped lexicon.
     * The word segmenter is not written by this call.
     *
     * @param w destination for the lexicon data
     * @throws IOException if the wrapped lexicon fails while writing
     */
    public void writeData(Writer w) throws IOException {
        final ChineseLexicon lexicon = chineseLexicon;
        lexicon.writeData(w);
    }

    /**
     * Serialization hook: restores the default fields, then re-installs a
     * tokenizer factory built from the restored word segmenter on
     * {@link ChineseTreebankLanguagePack} (the same side effect the
     * two-argument constructor has).
     *
     * @param in the stream this object is being read from
     * @throws IOException            if default deserialization fails
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        final WordSegmenter restoredSegmenter = this.wordSegmenter;
        ChineseTreebankLanguagePack.setTokenizerFactory(WordSegmentingTokenizer.factory(restoredSegmenter));
    }

    /**
     * Counts the arguments that follow {@code args[index]} up to (but not
     * including) the next argument that starts with {@code '-'}, or the end
     * of the array.
     *
     * An empty-string argument counts as a sub-argument; it previously caused
     * a {@code StringIndexOutOfBoundsException} because its first character
     * was inspected unconditionally.
     *
     * @param args  the full command-line argument array
     * @param index position of the option whose sub-arguments are counted
     * @return the number of consecutive non-option arguments after {@code index}
     */
    private static int numSubArgs(String[] args, int index) {
        int last = index;
        // startsWith is safe on "" whereas charAt(0) is not.
        while (last + 1 < args.length && !args[last + 1].startsWith("-")) {
            ++last;
        }
        return last - index;
    }

    public ChineseLexiconAndWordSegmenter(Treebank trainTreebank, Options op) {
        ChineseLexiconAndWordSegmenter cs = this.getSegmenterDataFromTreebank(trainTreebank, op);
        this.chineseLexicon = cs.chineseLexicon;
        this.wordSegmenter = cs.wordSegmenter;
    }

    /**
     * Trains a lexicon/segmenter pair from a treebank.
     *
     * Steps, in order: print a timestamp and start the timer; build a tree
     * binarizer (left-to-right variant when {@code Train.leftToRight} is set);
     * compute the parent-annotation split categories when
     * {@code Train.selectiveSplit} / {@code Train.selectivePostSplit} are set;
     * binarize every training tree (after the Collins punctuation transform
     * when {@code Train.collinsPunc} is set); finally obtain a lexicon from
     * {@code op.tlpParams.lex(op.lexOptions)} and train it on the binarized trees.
     *
     * @param trainTreebank the trees to train on
     * @param op            options supplying language parameters and lexicon options
     * @return the trained instance created by {@code op.tlpParams.lex(...)}
     */
    private ChineseLexiconAndWordSegmenter getSegmenterDataFromTreebank(Treebank trainTreebank, Options op) {
        System.out.println("Currently " + new Date());
        Timing.startTime();
        final TreebankLangParserParams params = op.tlpParams;
        if (Test.verbose) {
            System.out.print("Training ");
            System.out.println(trainTreebank.textualSummary());
        }

        System.out.print("Binarizing trees...");
        final TreeAnnotatorAndBinarizer binarizer;
        if (Train.leftToRight) {
            binarizer = new TreeAnnotatorAndBinarizer(params.headFinder(), new LeftHeadFinder(), params, op.forceCNF, !Train.outsideFactor(), true);
        } else {
            binarizer = new TreeAnnotatorAndBinarizer(params, op.forceCNF, !Train.outsideFactor(), true);
        }
        final CollinsPuncTransformer puncTransformer =
                Train.collinsPunc ? new CollinsPuncTransformer(params.treebankLanguagePack()) : null;

        if (Train.selectiveSplit) {
            Train.splitters = ParentAnnotationStats.getSplitCategories(trainTreebank, true, 0, Train.selectiveSplitCutOff, Train.tagSelectiveSplitCutOff, params.treebankLanguagePack());
            if (Test.verbose) {
                System.err.println("Parent split categories: " + Train.splitters);
            }
        }
        if (Train.selectivePostSplit) {
            TreeAnnotator annotator = new TreeAnnotator(params.headFinder(), params);
            Treebank annotated = trainTreebank.transform(annotator);
            Train.postSplitters = ParentAnnotationStats.getSplitCategories(annotated, true, 0, Train.selectivePostSplitCutOff, Train.tagSelectivePostSplitCutOff, params.treebankLanguagePack());
            if (Test.verbose) {
                System.err.println("Parent post annotation split categories: " + Train.postSplitters);
            }
        }

        if (Train.hSelSplit) {
            // Dry run with selective splitting off; the transformed trees are discarded.
            // NOTE(review): presumably this pass only lets the binarizer collect statistics - confirm.
            binarizer.setDoSelectiveSplit(false);
            for (Tree source : trainTreebank) {
                Tree prepared = Train.collinsPunc ? puncTransformer.transformTree(source) : source;
                binarizer.transformTree(prepared);
            }
            binarizer.setDoSelectiveSplit(true);
        }

        final List<Tree> binarized = new ArrayList<Tree>();
        for (Tree source : trainTreebank) {
            Tree prepared = Train.collinsPunc ? puncTransformer.transformTree(source) : source;
            binarized.add(binarizer.transformTree(prepared));
        }
        Timing.tick("done.");
        if (Test.verbose) {
            binarizer.dumpStats();
        }

        System.out.print("Extracting Lexicon...");
        final ChineseLexiconAndWordSegmenter trained = (ChineseLexiconAndWordSegmenter)op.tlpParams.lex(op.lexOptions);
        trained.train(binarized);
        Timing.tick("done.");
        return trained;
    }

    /**
     * Prints the command-line arguments on one line, preceded by a fixed
     * banner and followed by a line terminator.
     *
     * @param args the arguments to echo
     * @param ps   the stream to print to
     */
    private static void printArgs(String[] args, PrintStream ps) {
        StringBuilder line = new StringBuilder("ChineseLexiconAndWordSegmenter invoked with arguments:");
        for (String arg : args) {
            line.append(' ').append(arg);
        }
        ps.println(line.toString());
    }

    /**
     * Serializes a lexicon/segmenter pair to a file using Java object
     * serialization. The output is gzip-compressed when the file name ends
     * in {@code .gz}.
     *
     * I/O failures are not propagated: the stack trace is printed to
     * standard error and the method returns.
     *
     * @param cs       the object to write
     * @param filename path of the file to create or overwrite
     */
    static void saveSegmenterDataToSerialized(ChineseLexiconAndWordSegmenter cs, String filename) {
        try {
            System.err.print("Writing segmenter in serialized format to file " + filename + " ");
            FileOutputStream fileOut = new FileOutputStream(filename);
            try {
                ObjectOutputStream out2 = filename.endsWith(".gz")
                        ? new ObjectOutputStream(new BufferedOutputStream(new GZIPOutputStream(fileOut)))
                        : new ObjectOutputStream(new BufferedOutputStream(fileOut));
                out2.writeObject(cs);
                out2.close();
            }
            finally {
                // Releases the file handle even when a wrapper constructor or writeObject
                // throws; after a successful out2.close() this second close is a no-op.
                fileOut.close();
            }
            System.err.println("done.");
        }
        catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Writes the lexicon data of a lexicon/segmenter pair to a text file.
     * The file starts with the line {@code BEGIN LEXICON}, followed by
     * whatever {@code cs.writeData} emits (skipped when {@code cs} is null)
     * and a blank line. The output is gzip-compressed when the file name
     * ends in {@code .gz}.
     *
     * The text is encoded with the platform default charset, because the
     * {@code PrintWriter} is created without an explicit encoding.
     *
     * I/O failures are not propagated: a message and the stack trace are
     * printed to standard error and the method returns.
     *
     * @param cs       the object whose lexicon data is written; may be null
     * @param filename path of the file to create or overwrite
     */
    static void saveSegmenterDataToText(ChineseLexiconAndWordSegmenter cs, String filename) {
        try {
            System.err.print("Writing segmenter in text grammar format to file " + filename);
            FileOutputStream fileOut = new FileOutputStream(filename);
            try {
                BufferedOutputStream os = filename.endsWith(".gz")
                        ? new BufferedOutputStream(new GZIPOutputStream(fileOut))
                        : new BufferedOutputStream(fileOut);
                PrintWriter out2 = new PrintWriter(os);
                out2.println("BEGIN LEXICON");
                if (cs != null) {
                    cs.writeData(out2);
                }
                out2.println();
                System.err.print(".");
                out2.flush();
                out2.close();
                // PrintWriter never throws on write errors; surface them explicitly so
                // that "done." is not reported for a truncated file.
                if (out2.checkError()) {
                    throw new IOException("Error while writing segmenter data to " + filename);
                }
            }
            finally {
                // Releases the file handle even when writeData throws; a no-op after out2.close().
                fileOut.close();
            }
            System.err.println("done.");
        }
        catch (IOException e) {
            System.err.println("Trouble saving segmenter data to ASCII format.");
            e.printStackTrace();
        }
    }

    /**
     * Loads a training treebank from a path, reporting progress on standard error.
     *
     * @param treebankPath path handed to the treebank's {@code loadPath}
     * @param op           options whose language parameters create the empty treebank
     * @param filt         file filter restricting which files are loaded, or
     *                     {@code null} to load the path without a filter
     * @return the populated treebank
     */
    private static Treebank makeTreebank(String treebankPath, Options op, FileFilter filt) {
        System.err.println("Training a segmenter from treebank dir: " + treebankPath);
        final MemoryTreebank loaded = op.tlpParams.memoryTreebank();
        System.err.print("Reading trees...");
        if (filt != null) {
            loaded.loadPath(treebankPath, filt);
        } else {
            loaded.loadPath(treebankPath);
        }
        final Treebank result = loaded;
        final int treeCount = result.size();
        Timing.tick("done [read " + treeCount + " trees].");
        return result;
    }

    /**
     * Builds a lexicon/segmenter pair by loading a previously saved one from
     * a file or URL and adopting its lexicon, segmenter and options.
     *
     * Note that the {@code op} field is taken from the loaded object, not
     * from the {@code op} argument.
     *
     * @param segmenterFileOrUrl location of the saved segmenter data
     * @param op                 passed through to {@code getSegmenterDataFromFile}
     * @throws IllegalArgumentException if no segmenter data could be loaded
     *         from {@code segmenterFileOrUrl}
     */
    public ChineseLexiconAndWordSegmenter(String segmenterFileOrUrl, Options op) {
        ChineseLexiconAndWordSegmenter cs = ChineseLexiconAndWordSegmenter.getSegmenterDataFromFile(segmenterFileOrUrl, op);
        if (cs == null) {
            // The loader signals failure with null; fail with a descriptive exception
            // instead of an anonymous NullPointerException on the field reads below.
            throw new IllegalArgumentException("Could not load segmenter data from " + segmenterFileOrUrl);
        }
        this.op = cs.op;
        this.chineseLexicon = cs.chineseLexicon;
        this.wordSegmenter = cs.wordSegmenter;
    }

    /**
     * Loads a lexicon/segmenter pair from a file or URL. Only the serialized
     * format is attempted; there is no fallback to another format.
     *
     * @param parserFileOrUrl location of the serialized segmenter data
     * @param op              currently unused
     * @return the loaded object, or {@code null} if the serialized loader returned null
     */
    public static ChineseLexiconAndWordSegmenter getSegmenterDataFromFile(String parserFileOrUrl, Options op) {
        return ChineseLexiconAndWordSegmenter.getSegmenterDataFromSerializedFile(parserFileOrUrl);
    }

    /**
     * Deserializes a lexicon/segmenter pair from a local file or an HTTP(S) URL.
     * The stream is gunzipped first when the name ends in {@code .gz}.
     *
     * Failure handling:
     * <ul>
     * <li>{@code InvalidClassException} or {@code FileNotFoundException}:
     *     the stack trace is printed and the JVM exits with status 2;</li>
     * <li>{@code StreamCorruptedException}: a message is printed and
     *     {@code null} is returned;</li>
     * <li>any other exception: the stack trace is printed and {@code null}
     *     is returned.</li>
     * </ul>
     *
     * SECURITY NOTE(review): this uses Java native deserialization, which can
     * execute code from the serialized data. Only load data from trusted
     * locations, especially when a URL is given.
     *
     * @param serializedFileOrUrl a file path, or a URL starting with
     *                            {@code http://} or {@code https://}
     * @return the deserialized object, or {@code null} on a non-fatal failure
     */
    protected static ChineseLexiconAndWordSegmenter getSegmenterDataFromSerializedFile(String serializedFileOrUrl) {
        InputStream is = null;
        try {
            System.err.print("Loading segmenter from serialized file " + serializedFileOrUrl + " ...");
            if (serializedFileOrUrl.startsWith("http://") || serializedFileOrUrl.startsWith("https://")) {
                URL u = new URL(serializedFileOrUrl);
                URLConnection uc = u.openConnection();
                is = uc.getInputStream();
            } else {
                is = new FileInputStream(serializedFileOrUrl);
            }
            ObjectInputStream in = serializedFileOrUrl.endsWith(".gz")
                    ? new ObjectInputStream(new BufferedInputStream(new GZIPInputStream(is)))
                    : new ObjectInputStream(new BufferedInputStream(is));
            ChineseLexiconAndWordSegmenter cs = (ChineseLexiconAndWordSegmenter)in.readObject();
            in.close();
            System.err.println(" done.");
            return cs;
        }
        catch (InvalidClassException ice) {
            System.err.println();
            ice.printStackTrace();
            System.exit(2);
        }
        catch (FileNotFoundException fnfe) {
            System.err.println();
            fnfe.printStackTrace();
            System.exit(2);
        }
        catch (StreamCorruptedException sce) {
            // Previously swallowed silently, which left the "Loading ..." line dangling
            // and gave the caller a null with no explanation.
            System.err.println();
            System.err.println("Not a valid serialized segmenter: " + serializedFileOrUrl + " (" + sce.getMessage() + ")");
        }
        catch (Exception e) {
            System.err.println();
            e.printStackTrace();
        }
        finally {
            // Release the underlying stream on every path; harmless after in.close().
            if (is != null) {
                try {
                    is.close();
                }
                catch (IOException ignored) {
                    // Nothing useful can be done about a failure to close an input stream.
                }
            }
        }
        return null;
    }

    /**
     * Command-line entry point: trains or loads a Chinese lexicon/segmenter,
     * optionally saves it, and segments the remaining file arguments.
     *
     * Options recognized here (case-insensitive), all of which must precede
     * the positional arguments:
     * <ul>
     * <li>{@code -train treebankPath ranges} or
     *     {@code -train treebankPath low high}: train from a treebank
     *     (also turns on saving in serialized format);</li>
     * <li>{@code -encoding enc}: input and output encoding;</li>
     * <li>{@code -loadFromSerializedFile fileOrUrl};</li>
     * <li>{@code -saveToSerializedFile file};</li>
     * <li>{@code -saveToTextFile file};</li>
     * <li>{@code -treebank [path] ranges} or {@code -treebank path low high}:
     *     test treebank selection.</li>
     * </ul>
     * Any other argument starting with {@code '-'} is offered to
     * {@code op.tlpParams.setOptionFlag}.
     *
     * When not training and no {@code -loadFromSerializedFile} was given,
     * the first positional argument is the segmenter file or URL. Remaining
     * positional arguments are files to segment, unless a test treebank was
     * requested.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        boolean train = false;
        boolean saveToSerializedFile = false;
        boolean saveToTextFile = false;
        String serializedInputFileOrUrl = null;
        String serializedOutputFileOrUrl = null;
        String textOutputFileOrUrl = null;
        String treebankPath = null;
        MemoryTreebank testTreebank = null;
        String testPath = null;
        FileFilter testFilter = null;
        FileFilter trainFilter = null;
        String encoding = null;
        TokenizerFactory tokenizerFactory = null;
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
        // No option in this method changes the five settings below.
        boolean tokenized = false;
        Function<List<HasWord>, List<HasWord>> escaper = null;
        int tagDelimiter = -1;
        String sentenceDelimiter = "\n";
        boolean fromXML = false;
        int argIndex = 0;
        if (args.length < 1) {
            System.err.println("usage: java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter segmenterFileOrUrl filename*");
            System.exit(1);
        }
        Options op = new Options();
        op.tlpParams = new ChineseTreebankParserParams();

        // ---- option parsing: consume leading arguments that start with '-' ----
        // startsWith (rather than charAt(0)) keeps an empty-string argument from crashing the loop.
        while (argIndex < args.length && args[argIndex].startsWith("-")) {
            String option = args[argIndex];
            if (option.equalsIgnoreCase("-train")) {
                train = true;
                saveToSerializedFile = true;
                int subArgCount = ChineseLexiconAndWordSegmenter.numSubArgs(args, argIndex);
                ++argIndex;
                // NOTE(review): a lone treebank path (exactly one sub-argument) is rejected
                // by this check; kept as-is because the intended contract is unclear.
                if (subArgCount > 1) {
                    treebankPath = args[argIndex];
                    ++argIndex;
                } else {
                    throw new RuntimeException("Error: -train option must have treebankPath as first argument.");
                }
                if (subArgCount == 2) {
                    trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
                } else {
                    // Three or more sub-arguments: try "low high", else fall back to a ranges string.
                    try {
                        int low = Integer.parseInt(args[argIndex]);
                        int high = Integer.parseInt(args[argIndex + 1]);
                        trainFilter = new NumberRangeFileFilter(low, high, true);
                        argIndex += 2;
                    }
                    catch (NumberFormatException e) {
                        trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                        ++argIndex;
                    }
                }
                continue;
            }
            if (option.equalsIgnoreCase("-encoding")) {
                encoding = ChineseLexiconAndWordSegmenter.optionValue(args, argIndex);
                op.tlpParams.setInputEncoding(encoding);
                op.tlpParams.setOutputEncoding(encoding);
                argIndex += 2;
                continue;
            }
            if (option.equalsIgnoreCase("-loadFromSerializedFile")) {
                serializedInputFileOrUrl = ChineseLexiconAndWordSegmenter.optionValue(args, argIndex);
                argIndex += 2;
                continue;
            }
            if (option.equalsIgnoreCase("-saveToSerializedFile")) {
                saveToSerializedFile = true;
                serializedOutputFileOrUrl = ChineseLexiconAndWordSegmenter.optionValue(args, argIndex);
                argIndex += 2;
                continue;
            }
            if (option.equalsIgnoreCase("-saveToTextFile")) {
                saveToTextFile = true;
                textOutputFileOrUrl = ChineseLexiconAndWordSegmenter.optionValue(args, argIndex);
                argIndex += 2;
                continue;
            }
            if (option.equalsIgnoreCase("-treebank")) {
                int subArgCount = ChineseLexiconAndWordSegmenter.numSubArgs(args, argIndex);
                ++argIndex;
                if (subArgCount == 1) {
                    // A single sub-argument is a ranges filter; the path defaults later.
                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                } else if (subArgCount > 1) {
                    testPath = args[argIndex++];
                    if (subArgCount == 2) {
                        testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    } else {
                        try {
                            int low = Integer.parseInt(args[argIndex]);
                            int high = Integer.parseInt(args[argIndex + 1]);
                            testFilter = new NumberRangeFileFilter(low, high, true);
                            argIndex += 2;
                        }
                        catch (NumberFormatException e) {
                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                        }
                    }
                }
                continue;
            }
            // Anything else is offered to the language-specific parameters.
            int j = op.tlpParams.setOptionFlag(args, argIndex);
            if (j == argIndex) {
                System.err.println("Unknown option ignored: " + args[argIndex]);
            }
            argIndex = ++j;
        }

        TreebankLangParserParams tlpParams = op.tlpParams;
        ChineseLexiconAndWordSegmenter cs = null;
        if (!train && Test.verbose) {
            System.out.println("Currently " + new Date());
            ChineseLexiconAndWordSegmenter.printArgs(args, System.out);
        }

        // ---- obtain the segmenter: train it or load it ----
        if (train) {
            ChineseLexiconAndWordSegmenter.printArgs(args, System.out);
            if (treebankPath == null) {
                treebankPath = args[argIndex];
                if (args.length > ++argIndex + 1) {
                    try {
                        int low = Integer.parseInt(args[argIndex]);
                        int high = Integer.parseInt(args[argIndex + 1]);
                        trainFilter = new NumberRangeFileFilter(low, high, true);
                        argIndex += 2;
                    }
                    catch (NumberFormatException e) {
                        trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                        ++argIndex;
                    }
                }
            }
            Treebank trainTreebank = ChineseLexiconAndWordSegmenter.makeTreebank(treebankPath, op, trainFilter);
            cs = new ChineseLexiconAndWordSegmenter(trainTreebank, op);
        } else {
            if (serializedInputFileOrUrl == null) {
                if (argIndex >= args.length) {
                    throw new RuntimeException("Error: no segmenter file or URL specified.");
                }
                serializedInputFileOrUrl = args[argIndex];
                ++argIndex;
            }
            try {
                cs = new ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
            }
            catch (IllegalArgumentException e) {
                System.err.println("Error loading segmenter, exiting...");
                // A load failure is an error, so report a non-zero status.
                System.exit(1);
            }
        }

        TreePrint treePrint = Test.treePrint(tlpParams);
        if (testFilter != null) {
            if (testPath == null) {
                if (treebankPath == null) {
                    throw new RuntimeException("No test treebank path specified...");
                }
                System.err.println("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                testPath = treebankPath;
            }
            testTreebank = tlpParams.testMemoryTreebank();
            testTreebank.loadPath(testPath, testFilter);
        }
        Train.sisterSplitters = new HashSet<String>(Arrays.asList(tlpParams.sisterSplitters()));
        if (Test.verbose) {
            System.err.println("Lexicon is " + cs.getClass().getName());
        }
        PrintWriter pwOut = tlpParams.pw();
        PrintWriter pwErr = tlpParams.pw(System.err);

        // ---- optional saving ----
        if (saveToTextFile) {
            if (textOutputFileOrUrl != null) {
                ChineseLexiconAndWordSegmenter.saveSegmenterDataToText(cs, textOutputFileOrUrl);
            } else {
                System.err.println("Usage: must specify a text segmenter data output path");
            }
        }
        if (saveToSerializedFile) {
            if (serializedOutputFileOrUrl == null && argIndex < args.length) {
                // With -train, the next positional argument is the output file.
                serializedOutputFileOrUrl = args[argIndex];
                ++argIndex;
            }
            if (serializedOutputFileOrUrl != null) {
                ChineseLexiconAndWordSegmenter.saveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
            } else if (textOutputFileOrUrl == null && testTreebank == null) {
                System.err.println("usage: java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter -train trainFilesPath [start stop] serializedParserFilename");
            }
        }

        // ---- test treebank or file segmentation ----
        if (testTreebank != null || argIndex < args.length && args[argIndex].equalsIgnoreCase("-treebank")) {
            // NOTE(review): the test treebank is loaded but not used any further in this method.
            if (testTreebank == null) {
                testTreebank = tlpParams.testMemoryTreebank();
                String trailingTestPath = ChineseLexiconAndWordSegmenter.optionValue(args, argIndex);
                if (args.length < argIndex + 4) {
                    testTreebank.loadPath(trailingTestPath);
                } else {
                    int testlow = Integer.parseInt(args[argIndex + 2]);
                    int testhigh = Integer.parseInt(args[argIndex + 3]);
                    testTreebank.loadPath(trailingTestPath, (FileFilter)new NumberRangeFileFilter(testlow, testhigh, true));
                }
            }
        } else {
            int numWords = 0;
            Timing timer = new Timing();
            if (tokenized) {
                tokenizerFactory = WhitespaceTokenizer.factory();
            }
            TreebankLanguagePack tlp = tlpParams.treebankLanguagePack();
            if (tokenizerFactory == null) {
                tokenizerFactory = tlp.getTokenizerFactory();
            }
            documentPreprocessor.setTokenizerFactory(tokenizerFactory);
            documentPreprocessor.setSentenceFinalPuncWords(tlp.sentenceFinalPunctuationWords());
            if (encoding != null) {
                documentPreprocessor.setEncoding(encoding);
            }
            timer.start();
            for (int i = argIndex; i < args.length; ++i) {
                String filename = args[i];
                try {
                    List<List<? extends HasWord>> document = fromXML
                            ? documentPreprocessor.getSentencesFromXML(filename, sentenceDelimiter, tokenized)
                            : documentPreprocessor.getSentencesFromText(filename, escaper, sentenceDelimiter, tagDelimiter);
                    System.err.println("Segmenting file: " + filename + " with " + document.size() + " sentences.");
                    PrintWriter pwo = pwOut;
                    if (Test.writeOutputFiles) {
                        try {
                            pwo = tlpParams.pw(new FileOutputStream(filename + ".stp"));
                        }
                        catch (IOException ioe) {
                            // Fall back to the shared output writer for this file.
                            ioe.printStackTrace();
                        }
                    }
                    treePrint.printHeader(pwo, tlp.getEncoding());
                    for (List<? extends HasWord> sentence : document) {
                        numWords += sentence.size();
                        pwo.println(new Sentence((Collection<? extends HasWord>)sentence));
                    }
                    treePrint.printFooter(pwo);
                    // Close only a per-file writer. Closing the shared writer after a failed
                    // open would silently drop the output of every later file.
                    if (pwo != pwOut) {
                        pwo.close();
                    }
                }
                catch (IOException e) {
                    pwErr.println("Couldn't find file: " + filename);
                }
            }
            long millis = timer.stop();
            double wordspersec = (double)numWords / ((double)millis / 1000.0);
            DecimalFormat nf = new DecimalFormat("0.00");
            pwErr.println("Segmented " + numWords + " words at " + nf.format(wordspersec) + " words per second.");
        }
    }

    /**
     * Returns the value that follows an option on the command line.
     *
     * @param args        the full command-line argument array
     * @param optionIndex position of the option in {@code args}
     * @return {@code args[optionIndex + 1]}
     * @throws RuntimeException if the option is the last argument
     */
    private static String optionValue(String[] args, int optionIndex) {
        if (optionIndex + 1 >= args.length) {
            throw new RuntimeException("Error: option " + args[optionIndex] + " requires an argument.");
        }
        return args[optionIndex + 1];
    }
}

