package pl.waw.ipipan.zil.summ.nicolas.utils;

import com.google.common.collect.Sets;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
import pl.waw.ipipan.zil.summ.nicolas.Constants;
import weka.core.TestInstances;

/* loaded from: input_file:pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.class */
/**
 * Static helpers for splitting raw strings into tokens and for rebuilding the
 * surface (orthographic) text of a {@link TSentence} from its tokens.
 *
 * <p>The class holds no state and cannot be instantiated.
 */
public final class TextUtils {

    /** Separator for {@link #tokenize}: a run of characters that are neither Unicode letters nor ASCII digits. */
    private static final Pattern NON_LETTER_OR_DIGIT_RUN = Pattern.compile("[^\\p{L}0-9]+");

    /** Separator for {@link #tokenizeOnWhitespace}: a run of one or more space characters (U+0020 only). */
    private static final Pattern SPACE_RUN = Pattern.compile(" +");

    /** Text inserted in front of every token that is not flagged as having no preceding space. */
    private static final String TOKEN_SEPARATOR = " ";

    private TextUtils() {
    }

    /**
     * Splits {@code str} on every run of characters that are neither Unicode
     * letters nor ASCII digits.
     *
     * <p>Standard {@code split} semantics apply: trailing empty strings are
     * dropped, but if {@code str} starts with a separator the first element of
     * the result is an empty string.
     *
     * @param str text to split; must not be {@code null}
     * @return fixed-size list view of the resulting pieces
     */
    public static List<String> tokenize(String str) {
        return Arrays.asList(NON_LETTER_OR_DIGIT_RUN.split(str));
    }

    /**
     * Splits {@code str} on runs of one or more space characters. Only the
     * plain space character is a separator; tabs and newlines are not.
     *
     * <p>As with {@link #tokenize(String)}, a leading separator yields an empty
     * first element.
     *
     * @param str text to split; must not be {@code null}
     * @return fixed-size list view of the resulting pieces
     */
    public static List<String> tokenizeOnWhitespace(String str) {
        return Arrays.asList(SPACE_RUN.split(str));
    }

    /**
     * Rebuilds the surface text of a sentence from all of its tokens.
     *
     * @param tSentence sentence whose tokens are joined
     * @return the joined token forms, trimmed
     */
    public static String loadSentence2Orth(TSentence tSentence) {
        return loadSentence2Orth(tSentence, Sets.newHashSet());
    }

    /**
     * Rebuilds the surface text of a sentence, leaving out the tokens whose id
     * is contained in {@code set}.
     *
     * <p>Each kept token is preceded by a single space unless the token reports
     * {@code isNoPrecedingSpace()}; the final string is trimmed.
     *
     * @param tSentence sentence whose tokens are joined
     * @param set ids of the tokens to leave out; only queried, never modified
     * @return the joined token forms, trimmed
     */
    public static String loadSentence2Orth(TSentence tSentence, Set<String> set) {
        StringBuilder orth = new StringBuilder();
        for (TToken token : tSentence.getTokens()) {
            if (set.contains(token.getId())) {
                continue;
            }
            if (!token.isNoPrecedingSpace()) {
                orth.append(TOKEN_SEPARATOR);
            }
            orth.append(token.getOrth());
        }
        return orth.toString().trim();
    }

    /**
     * Rebuilds the surface text of a sentence, leaving out every token whose
     * chosen interpretation carries a tag listed in
     * {@link Constants#STOP_POS_TAGS}.
     *
     * @param tSentence sentence whose tokens are filtered and joined
     * @return the joined forms of the remaining tokens, trimmed
     */
    public static String loadSentence2OrthExcludingStoptags(TSentence tSentence) {
        Set<String> stopTokenIds = tSentence.getTokens().stream()
                .filter(token -> Constants.STOP_POS_TAGS.contains(token.getChosenInterpretation().getCtag()))
                .map(TToken::getId)
                .collect(Collectors.toSet());
        return loadSentence2Orth(tSentence, stopTokenIds);
    }
}
