package se.lth.cs.srl.preprocessor.tokenization;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import java.io.File;
import java.util.Iterator;
import java.util.Properties;
import se.lth.cs.srl.util.FileExistenceVerifier;

/* loaded from: input_file:se/lth/cs/srl/preprocessor/tokenization/StanfordChineseSegmenterWrapper.class */
/**
 * {@link Tokenizer} implementation that delegates word segmentation to a
 * Stanford {@link CRFClassifier}.
 *
 * <p>The classifier is loaded once in the constructor and reused for every call to
 * {@link #tokenize(String)}.
 */
public class StanfordChineseSegmenterWrapper implements Tokenizer {
    /** Name of the serialized dictionary file expected inside the data directory. */
    private static final String DICTIONARY_FILE_NAME = "dict-chris6.ser.gz";

    /** Name of the serialized classifier model expected inside the data directory. */
    private static final String MODEL_FILE_NAME = "ctb.gz";

    /** Artificial token placed at index 0 of every tokenized sentence. */
    private static final String ROOT_TOKEN = "<root>";

    private final CRFClassifier<CoreLabel> classifier;

    /**
     * Loads the segmenter from the given data directory.
     *
     * <p>The directory must contain {@code dict-chris6.ser.gz} and {@code ctb.gz}.
     *
     * @param file the segmenter data directory
     * @throws Error if {@link FileExistenceVerifier#verifyFiles} reports a problem with
     *         either required file; the reported message is used as the error message
     */
    public StanfordChineseSegmenterWrapper(File file) {
        File dictionaryFile = new File(file, DICTIONARY_FILE_NAME);
        File modelFile = new File(file, MODEL_FILE_NAME);
        String problem = FileExistenceVerifier.verifyFiles(dictionaryFile, modelFile);
        if (problem != null) {
            // Error (rather than an exception) is kept for compatibility with existing callers.
            throw new Error(problem);
        }
        this.classifier = loadClassifier(file.toString(), dictionaryFile.toString(), modelFile.toString());
    }

    /**
     * Builds a classifier configured for segmentation and loads the model into it.
     *
     * @param corporaDictDir value for the {@code sighanCorporaDict} property
     * @param serDictionaryPath value for the {@code serDictionary} property
     * @param modelPath path of the serialized classifier passed to
     *        {@code loadClassifierNoExceptions}
     * @return the loaded classifier
     */
    private static CRFClassifier<CoreLabel> loadClassifier(
            String corporaDictDir, String serDictionaryPath, String modelPath) {
        Properties properties = new Properties();
        properties.setProperty("sighanCorporaDict", corporaDictDir);
        properties.setProperty("serDictionary", serDictionaryPath);
        properties.setProperty("inputEncoding", "UTF-8");
        properties.setProperty("sighanPostProcessing", "true");
        CRFClassifier<CoreLabel> loaded = new CRFClassifier<>(properties);
        loaded.loadClassifierNoExceptions(modelPath, properties);
        // Re-apply the properties after loading, as the original code did.
        // NOTE(review): presumably loading the model overwrites the flags - confirm.
        loaded.flags.setProperties(properties);
        return loaded;
    }

    /**
     * Segments the given text and returns the segments preceded by a root token.
     *
     * @param str the text to segment
     * @return an array whose first element is {@code "<root>"}, followed by the segments
     *         in the order the classifier produced them
     */
    @Override // se.lth.cs.srl.preprocessor.tokenization.Tokenizer
    public String[] tokenize(String str) {
        // Use the typed toArray overload: the no-arg toArray() is declared to return
        // Object[], so casting its result to String[] can fail with ClassCastException.
        String[] segments = this.classifier.segmentString(str).toArray(new String[0]);
        String[] tokens = new String[segments.length + 1];
        tokens[0] = ROOT_TOKEN;
        System.arraycopy(segments, 0, tokens, 1, segments.length);
        return tokens;
    }

    /**
     * Manual smoke test: loads the segmenter from a hard-coded installation path,
     * segments one sample sentence and prints each segment on its own line.
     *
     * @param strArr ignored
     * @throws Exception declared for compatibility; nothing in the body is known to
     *         throw a checked exception
     */
    public static void main(String[] strArr) throws Exception {
        // The base path already ends with '/', so the derived paths contain "//";
        // this is kept so the configured property values stay exactly as before.
        final String baseDir = "/home/users0/anders/storage/scratch/anders/stanford-segmenter-2013-06-20/";
        CRFClassifier<CoreLabel> segmenter = loadClassifier(
                baseDir + "/data",
                baseDir + "/data/dict-chris6.ser.gz",
                baseDir + "/data/ctb.gz");
        for (String segment : segmenter.segmentString("上海浦东近年来颁布实行了涉及经济、贸易、建设、规划、科技、文教等领域的七十一件法规性文件，确保了浦东开发的有序进行。")) {
            System.out.println(segment);
        }
    }
}
