/*
 * Decompiled with CFR 0.152.
 */
package com.googlecode.clearnlp.tokenization;

import com.carrotsearch.hppc.ObjectIntOpenHashMap;
import com.googlecode.clearnlp.morphology.MPLib;
import com.googlecode.clearnlp.tokenization.AbstractTokenizer;
import com.googlecode.clearnlp.util.UTArray;
import com.googlecode.clearnlp.util.pair.IntIntPair;
import com.googlecode.clearnlp.util.pair.StringBooleanPair;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import jregex.MatchResult;
import jregex.Replacer;
import jregex.Substitution;
import jregex.TextBuffer;

/**
 * Rule- and dictionary-driven tokenizer for English text.
 *
 * <p>Input is first split on white space; the resulting tokens are then refined by a fixed
 * sequence of passes (see {@link #getTokenList(String)}). Two mechanisms are used throughout:
 * <ul>
 *   <li><b>Splitting</b>: a jregex {@link Replacer} surrounds a match with {@link #S_DELIM}
 *       and prefixes it with {@link #S_PROTECTED}; the token is then split on the delimiter
 *       and every piece carrying the prefix becomes a protected token
 *       ({@code StringBooleanPair.b == true}) that later passes leave untouched.</li>
 *   <li><b>Masking</b>: a character that must survive later splitting (e.g. the period in
 *       {@code 3.14}) is temporarily replaced by a placeholder such as {@link #S_D0D} and
 *       restored at the very end of {@link #getTokenList(String)}.</li>
 * </ul>
 *
 * <p>Dictionaries (emoticons, abbreviations, hyphenated words, compounds, units) are read from
 * the zip stream given to the constructor. Every dictionary starts out empty, so a missing or
 * unreadable entry degrades tokenization quality instead of causing a
 * {@code NullPointerException} later on.
 */
public class EnglishTokenizer extends AbstractTokenizer {
    /** Never matches; used for {@link #P_HYPHEN_LIST} while no hyphen words are available. */
    private static final Pattern P_MATCH_NOTHING = Pattern.compile("(?!)");

    // Names of the dictionary entries looked up inside the zip archive.
    protected final String F_DIR = "tokenize/";
    protected final String F_EMOTICONS = "tokenize/emoticons.txt";
    protected final String F_ABBREVIATIONS = "tokenize/abbreviations.txt";
    protected final String F_HYPHENS = "tokenize/hyphens.txt";
    protected final String F_COMPOUNDS = "tokenize/compounds.txt";
    protected final String F_UNITS = "tokenize/units.txt";
    protected final String F_MICROSOFT = "tokenize/microsoft.txt";

    /** Delimiter inserted around matches and used to split tokens. */
    protected final String S_DELIM = " ";
    /** Prefix marking a piece that must become a protected token. */
    protected final String S_PROTECTED = "PR0T_";
    // Placeholders that temporarily replace characters which must not be split on.
    protected final String S_D0D = "_DPPD_";
    protected final String S_HYPHEN = "_HYYN_";
    protected final String S_AMPERSAND = "_APSD_";
    protected final String S_APOSTROPHY = "_AOOR_";
    protected final int N_PROTECTED = S_PROTECTED.length();

    protected final Pattern P_DELIM = Pattern.compile(S_DELIM);
    protected final Pattern P_HYPHEN = Pattern.compile("-");
    protected final Pattern P_ABBREVIATION = Pattern.compile("^(\\p{Alpha}\\.)+\\p{Alpha}?$");
    /** Characters masked by {@link #R_D0D}; the array index is embedded in the placeholder. */
    protected final String[] A_D0D = new String[]{".", ",", ":", "-", "/", "'"};

    protected Replacer R_URL;
    protected Replacer R_ABBREVIATION;
    protected Replacer R_PERIOD_LIKE;
    protected Replacer R_MARKER;
    protected Replacer R_APOSTROPHY;
    protected Replacer R_USDOLLAR;
    protected Replacer R_AMPERSAND;
    protected Replacer R_WAW;
    protected Replacer R_PUNCTUATION_PRE;
    protected Replacer R_PUNCTUATION_POST;
    protected Replacer[] R_D0D;
    protected Replacer[] R_UNIT;
    protected Set<String> T_EMOTICONS;
    protected Set<String> T_ABBREVIATIONS;
    protected Pattern P_HYPHEN_LIST;
    /** Maps each entry of {@link #A_D0D} to its index. */
    protected ObjectIntOpenHashMap<String> M_D0D;
    /** Maps a compound (parts concatenated) to its 1-based position in {@link #L_COMPOUNDS}. */
    protected ObjectIntOpenHashMap<String> M_COMPOUNDS;
    /** For every compound, the [begin, end) character spans of its parts. */
    protected List<IntIntPair[]> L_COMPOUNDS;
    protected Pattern[] P_RECOVER_D0D;
    protected Pattern P_RECOVER_DOT;
    protected Pattern P_RECOVER_HYPHEN;
    protected Pattern P_RECOVER_APOSTROPHY;
    protected Pattern P_RECOVER_AMPERSAND;

    /**
     * Builds the tokenizer and loads its dictionaries from {@code zin}.
     *
     * <p>Loading is best effort: if it fails, the stack trace is printed and the tokenizer keeps
     * whatever dictionaries were loaded up to that point (empty ones otherwise). The stream is
     * closed in either case.
     *
     * @param zin zip stream containing the {@code tokenize/*.txt} dictionary entries
     */
    public EnglishTokenizer(ZipInputStream zin) {
        this.initReplacers();
        this.initMapsD0D();
        this.initPatterns();
        this.initEmptyDictionaries();
        try {
            this.initDictionaries(zin);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Tokenizes {@code str}.
     *
     * <p>The order of the passes matters: earlier passes protect or mask material so that later,
     * more general punctuation passes do not split it.
     *
     * @param str the raw text
     * @return the tokens in order; {@code b} tells whether a token was protected along the way
     */
    @Override
    public List<StringBooleanPair> getTokenList(String str) {
        List<StringBooleanPair> lTokens = this.tokenizeWhiteSpaces(str);
        this.protectEmoticons(lTokens);
        lTokens = this.tokenizePatterns(lTokens, this.R_URL);
        lTokens = this.tokenizePatterns(lTokens, this.R_ABBREVIATION);
        lTokens = this.tokenizePatterns(lTokens, this.R_PERIOD_LIKE);
        lTokens = this.tokenizePatterns(lTokens, this.R_MARKER);
        lTokens = this.tokenizePatterns(lTokens, this.R_USDOLLAR);
        // Mask separators inside numeric expressions before punctuation gets split off.
        for (Replacer r : this.R_D0D) {
            this.replaceProtects(lTokens, r);
        }
        this.replaceHyphens(lTokens);
        lTokens = this.tokenizePatterns(lTokens, this.R_PUNCTUATION_PRE);
        this.protectAbbreviations(lTokens);
        this.protectFilenames(lTokens);
        lTokens = this.tokenizeCompounds(lTokens);
        lTokens = this.tokenizePatterns(lTokens, this.R_APOSTROPHY);
        this.replaceProtects(lTokens, this.R_AMPERSAND);
        this.replaceProtects(lTokens, this.R_WAW);
        for (Replacer r : this.R_UNIT) {
            lTokens = this.tokenizePatterns(lTokens, r);
        }
        if (this.b_twit) {
            this.protectTwits(lTokens);
        }
        lTokens = this.tokenizePatterns(lTokens, this.R_PUNCTUATION_POST);
        // Restore every masked character.
        int size = this.P_RECOVER_D0D.length;
        for (int i = 0; i < size; ++i) {
            this.recoverPatterns(lTokens, this.P_RECOVER_D0D[i], this.A_D0D[i]);
        }
        this.recoverPatterns(lTokens, this.P_RECOVER_HYPHEN, "-");
        this.recoverPatterns(lTokens, this.P_RECOVER_APOSTROPHY, "'");
        this.recoverPatterns(lTokens, this.P_RECOVER_AMPERSAND, "&");
        return lTokens;
    }

    /** Compiles all dictionary-independent replacers. */
    private void initReplacers() {
        this.R_URL = MPLib.URL_SPAN.replacer(new SubstitutionOne());
        // A token made of "letter." repetitions, optionally followed by punctuation.
        this.R_ABBREVIATION = new jregex.Pattern("(^(\\p{Alpha}\\.)+)(\\p{Punct}*$)").replacer(new SubstitutionOnePlus());
        // Two or more of '.', '?' or '!' in a row.
        this.R_PERIOD_LIKE = new jregex.Pattern("(\\.|\\?|\\!){2,}").replacer(new SubstitutionOne());
        // Runs of two or more identical marker characters.
        this.R_MARKER = new jregex.Pattern("\\-{2,}|\\*{2,}|\\={2,}|\\~{2,}|\\,{2,}|\\`{2,}|\\'{2,}").replacer(new SubstitutionOne());
        // Token-final apostrophe suffixes ('s, 'll, n't, ...), case-insensitive.
        this.R_APOSTROPHY = new jregex.Pattern("(?i)((\\')(s|d|m|z|ll|re|ve|nt)|n(\\')t)$").replacer(new SubstitutionOne());
        this.R_USDOLLAR = new jregex.Pattern("^US\\$").replacer(new SubstitutionOne());
        this.R_AMPERSAND = this.getReplacerAmpersand();
        this.R_WAW = this.getReplacerWAWs();
        this.R_PUNCTUATION_PRE = new jregex.Pattern("\\(|\\)|\\[|\\]|\\{|\\}|<|>|\\,|\\:|\\;|\\\"").replacer(new SubstitutionOne());
        this.R_PUNCTUATION_POST = new jregex.Pattern("\\.|\\?|\\!|\\`|\\'|\\-|\\/|\\@|\\#|\\$|\\%|\\&|\\|").replacer(new SubstitutionOne());
        this.initReplacersD0Ds();
    }

    /** Masks an ampersand standing between two upper-case letters. */
    private Replacer getReplacerAmpersand() {
        return new jregex.Pattern("(\\p{Upper})(\\&)(\\p{Upper})").replacer(new Substitution(){

            public void appendSubstitution(MatchResult match, TextBuffer dest) {
                dest.append(match.group(1));
                dest.append(S_AMPERSAND);
                dest.append(match.group(3));
            }
        });
    }

    /** Masks an apostrophe standing between two word characters. */
    private Replacer getReplacerWAWs() {
        return new jregex.Pattern("(\\w)(\\')(\\w)").replacer(new Substitution(){

            public void appendSubstitution(MatchResult match, TextBuffer dest) {
                dest.append(match.group(1));
                dest.append(S_APOSTROPHY);
                dest.append(match.group(3));
            }
        });
    }

    /**
     * Compiles the replacers that mask separators next to digits. Group 2 of every pattern is
     * the separator and is one of the entries of {@link #A_D0D}.
     */
    private void initReplacersD0Ds() {
        String[] regex = new String[]{"(^|\\p{Alnum})(\\.)(\\d)", "(\\d)(,|:|-|\\/)(\\d)", "(^)(\\')(\\d)", "(\\d)(\\')(s)"};
        int size = regex.length;
        this.R_D0D = new Replacer[size];
        for (int i = 0; i < size; ++i) {
            this.R_D0D[i] = new jregex.Pattern(regex[i]).replacer(new SubstitutionD0D());
        }
    }

    /** Indexes {@link #A_D0D} so a separator can be encoded as a number in its placeholder. */
    private void initMapsD0D() {
        this.M_D0D = new ObjectIntOpenHashMap<String>();
        int size = this.A_D0D.length;
        for (int i = 0; i < size; ++i) {
            this.M_D0D.put(this.A_D0D[i], i);
        }
    }

    /** Compiles the patterns that find placeholders again when they are restored. */
    private void initPatterns() {
        int size = this.A_D0D.length;
        this.P_RECOVER_D0D = new Pattern[size];
        for (int i = 0; i < size; ++i) {
            this.P_RECOVER_D0D[i] = Pattern.compile(S_D0D + i + "_");
        }
        this.P_RECOVER_HYPHEN = Pattern.compile(S_HYPHEN);
        this.P_RECOVER_APOSTROPHY = Pattern.compile(S_APOSTROPHY);
        this.P_RECOVER_AMPERSAND = Pattern.compile(S_AMPERSAND);
    }

    /** Gives every dictionary-backed field an empty, usable value. */
    private void initEmptyDictionaries() {
        this.T_EMOTICONS = new HashSet<String>();
        this.T_ABBREVIATIONS = new HashSet<String>();
        this.P_HYPHEN_LIST = P_MATCH_NOTHING;
        this.M_COMPOUNDS = new ObjectIntOpenHashMap<String>();
        this.L_COMPOUNDS = new ArrayList<IntIntPair[]>();
        this.R_UNIT = new Replacer[0];
    }

    /**
     * Walks the zip entries and loads every dictionary it recognizes; other entries are skipped.
     * The stream is closed when done, also if loading fails.
     *
     * <p>The per-entry readers are deliberately not closed: closing one would close the shared
     * zip stream and end the iteration.
     * NOTE(review): entries are decoded with the platform default charset - confirm the
     * encoding of the dictionary files before pinning one.
     *
     * @param zin zip stream positioned before its first entry
     * @throws Exception if reading or parsing an entry fails
     */
    private void initDictionaries(ZipInputStream zin) throws Exception {
        try {
            ZipEntry zEntry;
            while ((zEntry = zin.getNextEntry()) != null) {
                String filename = zEntry.getName();
                if (filename.equals(this.F_EMOTICONS)) {
                    this.T_EMOTICONS = this.getSet(zin);
                } else if (filename.equals(this.F_ABBREVIATIONS)) {
                    this.T_ABBREVIATIONS = this.getSet(zin);
                } else if (filename.equals(this.F_HYPHENS)) {
                    this.P_HYPHEN_LIST = this.getHyphenPatterns(zin);
                } else if (filename.equals(this.F_COMPOUNDS)) {
                    this.initDictionariesCompounds(zin);
                } else if (filename.equals(this.F_UNITS)) {
                    this.initDictionariesUnits(zin);
                }
            }
        }
        finally {
            zin.close();
        }
    }

    /**
     * Reads the current zip entry as a set of trimmed lines. Blank lines are skipped so that the
     * empty string never ends up in a dictionary.
     */
    private Set<String> getSet(ZipInputStream zin) throws Exception {
        String line;
        BufferedReader fin = new BufferedReader(new InputStreamReader(zin));
        Set<String> set = new HashSet<String>();
        while ((line = fin.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty()) continue;
            set.add(line);
        }
        return set;
    }

    /**
     * Reads the current zip entry and joins its lines into one alternation pattern.
     *
     * <p>Blank lines are skipped: an empty alternative would match every token and make
     * {@link #replaceHyphens(List)} mask all hyphens. An entry without any usable line yields a
     * pattern that never matches.
     */
    private Pattern getHyphenPatterns(ZipInputStream zin) throws Exception {
        String line;
        BufferedReader fin = new BufferedReader(new InputStreamReader(zin));
        StringBuilder build = new StringBuilder();
        while ((line = fin.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty()) continue;
            if (build.length() > 0) {
                build.append('|');
            }
            build.append(line);
        }
        return build.length() == 0 ? P_MATCH_NOTHING : Pattern.compile(build.toString());
    }

    /**
     * Reads the compound dictionary from the current zip entry. Each line lists the parts of one
     * compound separated by {@link #S_DELIM}; the concatenated parts form the lookup key and the
     * part boundaries are stored as character spans.
     *
     * <p>Ids stored in {@link #M_COMPOUNDS} are 1-based because
     * {@link #tokenizeCompounds(List)} treats the value 0 as "not a compound".
     */
    private void initDictionariesCompounds(ZipInputStream zin) throws Exception {
        String line;
        BufferedReader fin = new BufferedReader(new InputStreamReader(zin));
        ObjectIntOpenHashMap<String> ids = new ObjectIntOpenHashMap<String>();
        List<IntIntPair[]> allSpans = new ArrayList<IntIntPair[]>();
        while ((line = fin.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty()) continue;
            String[] parts = this.P_DELIM.split(line);
            List<IntIntPair> spans = new ArrayList<IntIntPair>(parts.length);
            int bIdx = 0;
            for (String part : parts) {
                // Doubled delimiters produce empty parts; a zero-length span would later
                // become an empty token.
                if (part.isEmpty()) continue;
                int eIdx = bIdx + part.length();
                spans.add(new IntIntPair(bIdx, eIdx));
                bIdx = eIdx;
            }
            allSpans.add(spans.toArray(new IntIntPair[spans.size()]));
            ids.put(UTArray.join(parts, ""), allSpans.size());
        }
        // Publish both structures together so they can never disagree.
        this.M_COMPOUNDS = ids;
        this.L_COMPOUNDS = allSpans;
    }

    /**
     * Reads the unit dictionary from the current zip entry. Its first three lines are regex
     * fragments for signs, currencies and units, in that order; they are spliced into patterns
     * that separate such a symbol from an adjacent digit.
     *
     * @throws IllegalStateException if the entry has fewer than three lines
     */
    private void initDictionariesUnits(ZipInputStream zin) throws Exception {
        BufferedReader fin = new BufferedReader(new InputStreamReader(zin));
        String signs = EnglishTokenizer.readRequiredLine(fin, "signs");
        String currencies = EnglishTokenizer.readRequiredLine(fin, "currencies");
        String units = EnglishTokenizer.readRequiredLine(fin, "units");
        Replacer[] replacers = new Replacer[4];
        replacers[0] = new jregex.Pattern("^(?i)(\\p{Punct}*" + signs + ")(\\d)").replacer(new SubstitutionTwo());
        replacers[1] = new jregex.Pattern("^(?i)(\\p{Punct}*" + currencies + ")(\\d)").replacer(new SubstitutionTwo());
        replacers[2] = new jregex.Pattern("(?i)(\\d)(" + currencies + "\\p{Punct}*)$").replacer(new SubstitutionTwo());
        replacers[3] = new jregex.Pattern("(?i)(\\d)(" + units + "\\p{Punct}*)$").replacer(new SubstitutionTwo());
        // Assigned only once complete, so a failure above never leaves null slots behind.
        this.R_UNIT = replacers;
    }

    /** Reads one trimmed line, failing with a descriptive message if the input has ended. */
    private static String readRequiredLine(BufferedReader fin, String what) throws Exception {
        String line = fin.readLine();
        if (line == null) {
            throw new IllegalStateException("tokenize/units.txt is missing its '" + what + "' line");
        }
        return line.trim();
    }

    /** Splits {@code str} on white space into unprotected tokens. */
    protected List<StringBooleanPair> tokenizeWhiteSpaces(String str) {
        List<StringBooleanPair> tokens = new ArrayList<StringBooleanPair>();
        for (String token : MPLib.splitWhiteSpaces(str)) {
            tokens.add(new StringBooleanPair(token, false));
        }
        return tokens;
    }

    /** Protects tokens consisting of {@code @} or {@code #} followed by an alphanumeric rest. */
    protected void protectTwits(List<StringBooleanPair> tokens) {
        for (StringBooleanPair token : tokens) {
            if (token.s.isEmpty()) continue;
            char c = token.s.charAt(0);
            if ((c == '@' || c == '#') && MPLib.isAlnum(token.s.substring(1))) {
                token.b = true;
            }
        }
    }

    /** Protects tokens listed in the emoticon dictionary. */
    protected void protectEmoticons(List<StringBooleanPair> tokens) {
        for (StringBooleanPair token : tokens) {
            if (this.T_EMOTICONS.contains(token.s)) {
                token.b = true;
            }
        }
    }

    /**
     * Protects tokens whose lower-cased form is in the abbreviation dictionary or matches
     * {@link #P_ABBREVIATION}.
     */
    protected void protectAbbreviations(List<StringBooleanPair> tokens) {
        for (StringBooleanPair token : tokens) {
            // Fixed locale: the default one may map letters differently (e.g. Turkish 'I').
            String lower = token.s.toLowerCase(Locale.ENGLISH);
            if (this.T_ABBREVIATIONS.contains(lower) || this.P_ABBREVIATION.matcher(lower).find()) {
                token.b = true;
            }
        }
    }

    /** Protects tokens whose lower-cased form matches {@code MPLib.FILE_EXTS}. */
    protected void protectFilenames(List<StringBooleanPair> tokens) {
        for (StringBooleanPair token : tokens) {
            String lower = token.s.toLowerCase(Locale.ENGLISH);
            if (MPLib.FILE_EXTS.matcher(lower).find()) {
                token.b = true;
            }
        }
    }

    /** Applies {@code rep} in place to every unprotected token, without splitting. */
    protected void replaceProtects(List<StringBooleanPair> tokens, Replacer rep) {
        for (StringBooleanPair token : tokens) {
            if (token.b) continue;
            token.s = rep.replace(token.s);
        }
    }

    /**
     * Masks all hyphens of every unprotected token whose lower-cased form matches the hyphen
     * dictionary.
     */
    protected void replaceHyphens(List<StringBooleanPair> tokens) {
        for (StringBooleanPair token : tokens) {
            if (token.b || !this.P_HYPHEN_LIST.matcher(token.s.toLowerCase(Locale.ENGLISH)).find()) continue;
            token.s = this.P_HYPHEN.matcher(token.s).replaceAll(S_HYPHEN);
        }
    }

    /**
     * Replaces every match of {@code p} with {@code replacement} in all tokens, protected or
     * not. {@code replacement} follows {@code Matcher.replaceAll} syntax.
     */
    protected void recoverPatterns(List<StringBooleanPair> tokens, Pattern p, String replacement) {
        for (StringBooleanPair token : tokens) {
            token.s = p.matcher(token.s).replaceAll(replacement);
        }
    }

    /**
     * Splits every unprotected token found in the compound dictionary into its parts; the parts
     * are protected. All other tokens are passed through unchanged.
     *
     * @return a new list; {@code oTokens} itself is not modified
     */
    protected List<StringBooleanPair> tokenizeCompounds(List<StringBooleanPair> oTokens) {
        List<StringBooleanPair> nTokens = new ArrayList<StringBooleanPair>();
        for (StringBooleanPair oToken : oTokens) {
            // Ids are 1-based; a result below zero therefore means "not a compound".
            int idx = oToken.b ? -1 : this.M_COMPOUNDS.get(oToken.s.toLowerCase(Locale.ENGLISH)) - 1;
            if (idx < 0) {
                nTokens.add(oToken);
                continue;
            }
            for (IntIntPair p : this.L_COMPOUNDS.get(idx)) {
                nTokens.add(new StringBooleanPair(oToken.s.substring(p.i1, p.i2), true));
            }
        }
        return nTokens;
    }

    /**
     * Applies {@code rep} to every unprotected token and splits the result into new tokens;
     * protected tokens are passed through unchanged.
     *
     * @return a new list; {@code oTokens} itself is not modified
     */
    protected List<StringBooleanPair> tokenizePatterns(List<StringBooleanPair> oTokens, Replacer rep) {
        List<StringBooleanPair> nTokens = new ArrayList<StringBooleanPair>();
        for (StringBooleanPair oToken : oTokens) {
            if (oToken.b) {
                nTokens.add(oToken);
                continue;
            }
            this.tokenizePatternsAux(nTokens, rep, oToken.s);
        }
        return nTokens;
    }

    /**
     * Applies {@code rep} to {@code str}, splits the result on the delimiter and appends the
     * pieces to {@code tokens}. Pieces carrying the protection prefix are added without it and
     * marked protected; empty pieces are dropped.
     */
    private void tokenizePatternsAux(List<StringBooleanPair> tokens, Replacer rep, String str) {
        for (String token : this.P_DELIM.split(rep.replace(str).trim())) {
            if (token.startsWith(S_PROTECTED)) {
                tokens.add(new StringBooleanPair(token.substring(this.N_PROTECTED), true));
                continue;
            }
            if (token.isEmpty()) continue;
            tokens.add(new StringBooleanPair(token, false));
        }
    }

    /** Emits group 1 as a protected piece, followed by group 3 as an ordinary piece. */
    private class SubstitutionOnePlus
    implements Substitution {
        private SubstitutionOnePlus() {
        }

        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            dest.append(S_DELIM);
            dest.append(S_PROTECTED);
            dest.append(match.group(1));
            dest.append(S_DELIM);
            dest.append(match.group(3));
        }
    }

    /** Replaces group 2 (a separator from {@code A_D0D}) by its indexed placeholder. */
    private class SubstitutionD0D
    implements Substitution {
        private SubstitutionD0D() {
        }

        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            dest.append(match.group(1));
            dest.append(S_D0D + EnglishTokenizer.this.M_D0D.get(match.group(2)) + "_");
            dest.append(match.group(3));
        }
    }

    /** Inserts a delimiter between group 1 and group 2. */
    private class SubstitutionTwo
    implements Substitution {
        private SubstitutionTwo() {
        }

        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            dest.append(match.group(1));
            dest.append(S_DELIM);
            dest.append(match.group(2));
        }
    }

    /** Emits the whole match as a protected piece of its own. */
    private class SubstitutionOne
    implements Substitution {
        private SubstitutionOne() {
        }

        public void appendSubstitution(MatchResult match, TextBuffer dest) {
            dest.append(S_DELIM);
            dest.append(S_PROTECTED);
            dest.append(match.group(0));
            dest.append(S_DELIM);
        }
    }
}

