package weka.filters.unsupervised.attribute;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import weka.core.Capabilities;
import weka.core.DictionaryBuilder;
import weka.core.Environment;
import weka.core.EnvironmentHandler;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.OptionMetadata;
import weka.core.WeightedInstancesHandler;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.filters.SimpleStreamFilter;
import weka.filters.UnsupervisedFilter;
import weka.gui.FilePropertyMetadata;

/* loaded from: classes2.dex */
/**
 * Stream filter that turns string attributes into word-occurrence attributes
 * using a fixed, user-supplied dictionary.
 *
 * <p>All vectorization work is delegated to a {@link DictionaryBuilder}; this
 * class is responsible for locating and loading the dictionary (from an
 * {@link InputStream}, a {@link Reader} or a file) and for exposing the
 * builder's settings as filter options.
 */
public class FixedDictionaryStringToWordVector extends SimpleStreamFilter implements UnsupervisedFilter, EnvironmentHandler, WeightedInstancesHandler {
    private static final long serialVersionUID = 7990892846966916757L;

    /**
     * Whether the dictionary file is a binary serialized dictionary. Its
     * negation is passed as the second argument of
     * {@code DictionaryBuilder.loadDictionary(File, boolean)}.
     */
    protected boolean m_dictionaryIsBinary;

    /** Optional stream to read the dictionary from; checked before the reader and the file. */
    protected transient InputStream m_dictionarySource;

    /** Optional reader to read the dictionary from; checked before the file. */
    protected transient Reader m_textDictionarySource;

    /** Does the actual dictionary handling and instance vectorization. */
    protected DictionaryBuilder m_vectorizer = new DictionaryBuilder();

    /** Dictionary file; the default value is a placeholder that does not name a real file. */
    protected File m_dictionaryFile = new File("-- set me --");

    /**
     * Environment used to substitute variables in the dictionary path. Being
     * transient, it is {@code null} on a deserialized instance, so readers must
     * tolerate {@code null}.
     */
    protected transient Environment m_env = Environment.getSystemWide();

    /**
     * Command-line entry point: runs this filter with the given options.
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        runFilter(new FixedDictionaryStringToWordVector(), strArr);
    }

    /**
     * Determines the output format for the given input format, loading the
     * dictionary if necessary.
     *
     * <p>If the vectorizer is already set up for an input format with equal
     * headers, its existing vectorized format is returned and nothing is
     * reloaded. Otherwise the vectorizer is reset, set up for the new input
     * format, and the dictionary is loaded from the first available source in
     * this order: input stream, reader, file.
     *
     * <p>The overridden method is protected; the decompiler widened the access
     * modifier to public, which is kept here so existing callers still compile.
     *
     * @param instances the input format
     * @return the vectorized output format
     * @throws IOException if no dictionary source is configured, the file name
     *         is empty, or the file does not exist
     * @throws Exception if setting up the vectorizer or loading the dictionary fails
     */
    @Override // weka.filters.SimpleStreamFilter, weka.filters.SimpleFilter
    public Instances determineOutputFormat(Instances instances) throws Exception {
        // Reuse the already-built format when the input header has not changed.
        if (this.m_vectorizer.readyToVectorize() && instances.equalHeaders(this.m_vectorizer.getInputFormat())) {
            return this.m_vectorizer.getVectorizedFormat();
        }

        this.m_vectorizer.reset();
        this.m_vectorizer.setup(instances);

        boolean nothingConfigured = this.m_dictionaryFile == null
                && this.m_dictionarySource == null
                && this.m_textDictionarySource == null;
        if (nothingConfigured) {
            throw new IOException("No dictionary file/source specified!");
        }

        if (this.m_dictionarySource != null) {
            this.m_vectorizer.loadDictionary(this.m_dictionarySource);
        } else if (this.m_textDictionarySource != null) {
            this.m_vectorizer.loadDictionary(this.m_textDictionarySource);
        } else {
            this.m_vectorizer.loadDictionary(resolveDictionaryFile(), !this.m_dictionaryIsBinary);
        }
        return this.m_vectorizer.getVectorizedFormat();
    }

    /**
     * Resolves the configured dictionary file, substituting environment
     * variables in its path, and checks that it exists.
     *
     * @return the existing dictionary file
     * @throws IOException if the path is empty or the file does not exist
     */
    private File resolveDictionaryFile() throws IOException {
        String path = this.m_dictionaryFile.toString();
        if (path.length() == 0) {
            throw new IOException("No dictionary file specified!");
        }

        // m_env is transient (null after deserialization) and may also be set
        // to null by a caller. Previously that produced a swallowed
        // NullPointerException and variables were silently left unresolved;
        // fall back to the system-wide environment instead.
        Environment env = this.m_env != null ? this.m_env : Environment.getSystemWide();
        try {
            path = env.substitute(path);
        } catch (Exception ignored) {
            // Deliberate best effort: if substitution fails, continue with the
            // unsubstituted path and let the existence check below report it.
        }

        File dictionary = new File(path);
        if (!dictionary.exists()) {
            throw new IOException("Specified dictionary file '" + path + "' does not seem to exist!");
        }
        return dictionary;
    }

    /**
     * Returns the attribute range setting held by the vectorizer.
     *
     * @return the attribute indices string
     */
    public String getAttributeIndices() {
        return this.m_vectorizer.getAttributeIndices();
    }

    /**
     * Returns the prefix used for created attribute names.
     *
     * @return the attribute name prefix
     */
    public String getAttributeNamePrefix() {
        return this.m_vectorizer.getAttributeNamePrefix();
    }

    /**
     * Returns the capabilities of this filter: starting from the superclass
     * capabilities, everything is disabled and then all attribute types, all
     * class types, missing values, missing class values and "no class" are
     * enabled.
     *
     * @return the capabilities
     */
    @Override // weka.filters.Filter, weka.core.CapabilitiesHandler
    public Capabilities getCapabilities() {
        Capabilities capabilities = super.getCapabilities();
        capabilities.disableAll();
        capabilities.enableAllAttributes();
        capabilities.enable(Capabilities.Capability.MISSING_VALUES);
        capabilities.enableAllClasses();
        capabilities.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
        capabilities.enable(Capabilities.Capability.NO_CLASS);
        return capabilities;
    }

    /**
     * Returns the configured dictionary file.
     *
     * @return the dictionary file
     */
    public File getDictionaryFile() {
        return this.m_dictionaryFile;
    }

    /**
     * Returns the underlying dictionary builder. The live instance is
     * returned, not a copy.
     *
     * @return the dictionary builder used by this filter
     */
    public DictionaryBuilder getDictionaryHandler() {
        return this.m_vectorizer;
    }

    /**
     * Returns whether the dictionary file is treated as binary.
     *
     * @return true if the dictionary file is flagged as binary
     */
    public boolean getDictionaryIsBinary() {
        return this.m_dictionaryIsBinary;
    }

    /**
     * Returns the vectorizer's IDF transform setting.
     *
     * @return true if the IDF transform is enabled
     */
    public boolean getIDFTransform() {
        return this.m_vectorizer.getIDFTransform();
    }

    /**
     * Returns the vectorizer's invert-selection setting.
     *
     * @return true if the attribute selection is inverted
     */
    public boolean getInvertSelection() {
        return this.m_vectorizer.getInvertSelection();
    }

    /**
     * Returns the vectorizer's lower-case-tokens setting.
     *
     * @return true if tokens are lower-cased
     */
    public boolean getLowerCaseTokens() {
        return this.m_vectorizer.getLowerCaseTokens();
    }

    /**
     * Returns the vectorizer's normalize setting.
     *
     * @return true if normalization is enabled
     */
    public boolean getNormalizeDocLength() {
        return this.m_vectorizer.getNormalize();
    }

    /**
     * Returns the vectorizer's output-word-counts setting.
     *
     * @return true if word counts are output
     */
    public boolean getOutputWordCounts() {
        return this.m_vectorizer.getOutputWordCounts();
    }

    /**
     * Returns the stemmer used by the vectorizer.
     *
     * @return the stemmer
     */
    public Stemmer getStemmer() {
        return this.m_vectorizer.getStemmer();
    }

    /**
     * Returns the stopwords handler used by the vectorizer.
     *
     * @return the stopwords handler
     */
    public StopwordsHandler getStopwordsHandler() {
        return this.m_vectorizer.getStopwordsHandler();
    }

    /**
     * Returns the vectorizer's TF transform setting.
     *
     * @return true if the TF transform is enabled
     */
    public boolean getTFTransform() {
        return this.m_vectorizer.getTFTransform();
    }

    /**
     * Returns the tokenizer used by the vectorizer.
     *
     * @return the tokenizer
     */
    public Tokenizer getTokenizer() {
        return this.m_vectorizer.getTokenizer();
    }

    /**
     * Returns a textual description of this filter.
     *
     * @return the description
     */
    @Override // weka.filters.SimpleFilter
    public String globalInfo() {
        return "Converts String attributes into a set of attributes representing word occurrence (depending on the tokenizer) information from the text contained in the strings. The set of words (attributes) is taken from a user-supplied dictionary, either in plain text form or as a serialized java object.";
    }

    /**
     * Vectorizes a single instance by delegating to the dictionary builder.
     *
     * @param instance the instance to convert
     * @return the result of {@code DictionaryBuilder.vectorizeInstance}
     * @throws Exception if vectorization fails
     */
    @Override // weka.filters.SimpleStreamFilter
    protected Instance process(Instance instance) throws Exception {
        return this.m_vectorizer.vectorizeInstance(instance);
    }

    /**
     * Sets the range of attributes to operate on.
     *
     * @param str the attribute range specification
     */
    @OptionMetadata(commandLineParamName = "R", commandLineParamSynopsis = "-R <range>", description = "Specify range of attributes to act on. This is a comma separated list of attribute\nindices, with \"first\" and \"last\" valid values.", displayName = "Range of attributes to operate on", displayOrder = 4)
    public void setAttributeIndices(String str) {
        this.m_vectorizer.setAttributeIndices(str);
    }

    /**
     * Sets the prefix for the names of created attributes.
     *
     * @param str the prefix
     */
    @OptionMetadata(commandLineParamName = "P", commandLineParamSynopsis = "-P <attribute name prefix>", description = "Specify a prefix for the created attribute names (default: \"\")", displayName = "Prefix for created attribute names", displayOrder = 6)
    public void setAttributeNamePrefix(String str) {
        this.m_vectorizer.setAttributeNamePrefix(str);
    }

    /**
     * Sets the dictionary file. The file is only used when neither a stream
     * nor a reader source has been set.
     *
     * @param file the dictionary file
     */
    @FilePropertyMetadata(directoriesOnly = false, fileChooserDialogType = 0)
    @OptionMetadata(commandLineParamName = "dictionary", commandLineParamSynopsis = "-dictionary <path to dictionary file>", description = "The path to the dictionary to use", displayName = "Dictionary file", displayOrder = 1)
    public void setDictionaryFile(File file) {
        this.m_dictionaryFile = file;
    }

    /**
     * Sets whether the dictionary file holds a binary serialized dictionary.
     *
     * @param z true if the dictionary file is binary
     */
    @OptionMetadata(commandLineParamIsFlag = true, commandLineParamName = "binary-dict", commandLineParamSynopsis = "-binary-dict", description = "Dictionary file contains a binary serialized dictionary", displayName = "Dictionary is binary", displayOrder = 2)
    public void setDictionaryIsBinary(boolean z) {
        this.m_dictionaryIsBinary = z;
    }

    /**
     * Sets an input stream to load the dictionary from. When set, it takes
     * precedence over the reader source and the dictionary file.
     *
     * @param inputStream the stream to read the dictionary from
     */
    public void setDictionarySource(InputStream inputStream) {
        this.m_dictionarySource = inputStream;
    }

    /**
     * Sets a reader to load the dictionary from. When set (and no input stream
     * is set), it takes precedence over the dictionary file.
     *
     * @param reader the reader to read the dictionary from
     */
    public void setDictionarySource(Reader reader) {
        this.m_textDictionarySource = reader;
    }

    /**
     * Sets the environment used for variable substitution in the dictionary
     * path. If {@code null}, the system-wide environment is used when the
     * path is resolved.
     *
     * @param environment the environment to use
     */
    @Override // weka.core.EnvironmentHandler
    public void setEnvironment(Environment environment) {
        this.m_env = environment;
    }

    /**
     * Sets the vectorizer's IDF transform setting.
     *
     * @param z true to enable the IDF transform
     */
    // NOTE(review): the synopsis is flag-style ("-I") but, unlike the other
    // boolean options in this class, commandLineParamIsFlag is not set here.
    // Left unchanged because it affects command-line parsing - confirm intent.
    @OptionMetadata(commandLineParamName = "I", commandLineParamSynopsis = "-I", description = "Set whether the word frequencies in a document should be transformed into\nfij*log(num of Docs/num of docs with word i), where fij is the frequency\nof word i in document (instance) j.", displayName = "IDF transform", displayOrder = 8)
    public void setIDFTransform(boolean z) {
        this.m_vectorizer.setIDFTransform(z);
    }

    /**
     * Sets the vectorizer's invert-selection setting.
     *
     * @param z true to invert the attribute selection
     */
    @OptionMetadata(commandLineParamIsFlag = true, commandLineParamName = "V", commandLineParamSynopsis = "-V", description = "Set attributes selection mode. If false, only selected attributes in the range will\nbe worked on. If true, only non-selected attributes will be processed", displayName = "Invert selection", displayOrder = 5)
    public void setInvertSelection(boolean z) {
        this.m_vectorizer.setInvertSelection(z);
    }

    /**
     * Sets the vectorizer's lower-case-tokens setting.
     *
     * @param z true to lower-case tokens
     */
    @OptionMetadata(commandLineParamIsFlag = true, commandLineParamName = "L", commandLineParamSynopsis = "-L", description = "Convert all tokens to lowercase when matching against dictionary entries.", displayName = "Lower case tokens", displayOrder = 10)
    public void setLowerCaseTokens(boolean z) {
        this.m_vectorizer.setLowerCaseTokens(z);
    }

    /**
     * Sets the vectorizer's normalize setting.
     *
     * @param z true to enable normalization
     */
    @OptionMetadata(commandLineParamIsFlag = true, commandLineParamName = "N", commandLineParamSynopsis = "-N", description = "Whether to normalize to average length of documents seen during dictionary construction", displayName = "Normalize word frequencies", displayOrder = 9)
    public void setNormalizeDocLength(boolean z) {
        this.m_vectorizer.setNormalize(z);
    }

    /**
     * Sets the vectorizer's output-word-counts setting.
     *
     * @param z true to output word counts
     */
    @OptionMetadata(commandLineParamIsFlag = true, commandLineParamName = "C", commandLineParamSynopsis = "-C", description = "Output word counts rather than boolean 0 or 1 (indicating presence or absence of a word", displayName = "Output word counts", displayOrder = 3)
    public void setOutputWordCounts(boolean z) {
        this.m_vectorizer.setOutputWordCounts(z);
    }

    /**
     * Sets the stemmer. A {@code null} argument installs a {@link NullStemmer}.
     *
     * @param stemmer the stemmer to use, or {@code null}
     */
    @OptionMetadata(commandLineParamName = "stemmer", commandLineParamSynopsis = "-stemmer <spec>", description = "The stemming algorithm (classname plus parameters) to use.", displayName = "Stemmer to use", displayOrder = 11)
    public void setStemmer(Stemmer stemmer) {
        this.m_vectorizer.setStemmer(stemmer != null ? stemmer : new NullStemmer());
    }

    /**
     * Sets the stopwords handler. A {@code null} argument installs a
     * {@link Null} handler.
     *
     * @param stopwordsHandler the stopwords handler to use, or {@code null}
     */
    @OptionMetadata(commandLineParamName = "stopwords-handler", commandLineParamSynopsis = "-stopwords-handler <spec>", description = "The stopwords handler to use (default = Null)", displayName = "Stop words handler", displayOrder = 12)
    public void setStopwordsHandler(StopwordsHandler stopwordsHandler) {
        this.m_vectorizer.setStopwordsHandler(stopwordsHandler != null ? stopwordsHandler : new Null());
    }

    /**
     * Sets the vectorizer's TF transform setting.
     *
     * @param z true to enable the TF transform
     */
    // NOTE(review): the synopsis is flag-style ("-T") but, unlike the other
    // boolean options in this class, commandLineParamIsFlag is not set here.
    // Left unchanged because it affects command-line parsing - confirm intent.
    @OptionMetadata(commandLineParamName = "T", commandLineParamSynopsis = "-T", description = "Set whether the word frequencies should be transformed into\nlog(1+fij), where fij is the frequency of word i in document (instance) j.", displayName = "TFT transform", displayOrder = 7)
    public void setTFTransform(boolean z) {
        this.m_vectorizer.setTFTransform(z);
    }

    /**
     * Sets the tokenizer. The value is passed to the vectorizer as-is; unlike
     * the stemmer and stopwords-handler setters there is no {@code null}
     * fallback here.
     *
     * @param tokenizer the tokenizer to use
     */
    @OptionMetadata(commandLineParamName = "tokenizer", commandLineParamSynopsis = "-tokenizer <spec>", description = "The tokenizing algorithm (classname plus parameters) to use.\n(default: weka.core.tokenizers.WordTokenizer)", displayName = "Tokenizer", displayOrder = 13)
    public void setTokenizer(Tokenizer tokenizer) {
        this.m_vectorizer.setTokenizer(tokenizer);
    }
}
