package com.datumbox.framework.applications.nlp;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.dataobjects.AssociativeArray;
import com.datumbox.framework.common.dataobjects.Dataframe;
import com.datumbox.framework.common.dataobjects.FlatDataCollection;
import com.datumbox.framework.common.dataobjects.Record;
import com.datumbox.framework.common.interfaces.Parameterizable;
import com.datumbox.framework.common.utilities.MapMethods;
import com.datumbox.framework.common.utilities.PHPMethods;
import com.datumbox.framework.common.utilities.StringCleaner;
import com.datumbox.framework.core.machinelearning.clustering.Kmeans;
import com.datumbox.framework.core.statistics.descriptivestatistics.Descriptives;
import com.datumbox.framework.core.utilities.text.parsers.HTMLParser;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jfree.chart.ChartPanel;

/* loaded from: input_file:com/datumbox/framework/applications/nlp/CETR.class */
/**
 * Extracts the textual content of an HTML document.
 *
 * <p>The document is split into rows (lines). For every row a text-to-tag ratio
 * is computed, the ratios (and optionally a smoothed derivative of them) are
 * clustered with K-means, and the rows of every cluster except the one with the
 * lowest mean ratio are kept. The plain text of the kept rows is concatenated
 * and returned.
 */
public class CETR {
    /** Matches a single HTML tag such as {@code <div class="x">} or {@code </p>}. */
    private static final Pattern NUMBER_OF_TAGS_PATTERN = Pattern.compile("<[^>]+?>", Pattern.DOTALL);

    /** Upper bound on the number of K-means iterations used when clustering the rows. */
    private static final int MAX_CLUSTERING_ITERATIONS = 200;

    /** Name handed to the {@link Kmeans} instance that is created for every extraction. */
    private final String dbName;

    /** Configuration handed to the temporary {@link Dataframe} and {@link Kmeans} instances. */
    private final Configuration conf;

    /**
     * Tuning knobs of the extraction.
     */
    public static class Parameters implements Parameterizable {
        private static final long serialVersionUID = 1L;

        /** Number of clusters (K) requested from K-means. */
        private int numberOfClusters = 2;

        /**
         * Window size used for the derivative feature. A value {@code <= 0}
         * disables the second feature, so clustering runs on the ratio alone.
         */
        private int alphaWindowSizeFor2DModel = 3;

        /** Stored and exposed via accessors; not read by any code in this class. */
        private int smoothingAverageRadius = 2;

        /**
         * @return the number of clusters requested from K-means
         */
        public int getNumberOfClusters() {
            return this.numberOfClusters;
        }

        /**
         * @param i the number of clusters to request from K-means
         */
        public void setNumberOfClusters(int i) {
            this.numberOfClusters = i;
        }

        /**
         * @return the derivative window size; {@code <= 0} means the derivative feature is off
         */
        public int getAlphaWindowSizeFor2DModel() {
            return this.alphaWindowSizeFor2DModel;
        }

        /**
         * @param i the derivative window size; pass a value {@code <= 0} to turn the feature off
         */
        public void setAlphaWindowSizeFor2DModel(int i) {
            this.alphaWindowSizeFor2DModel = i;
        }

        /**
         * @return the stored smoothing radius
         */
        public int getSmoothingAverageRadius() {
            return this.smoothingAverageRadius;
        }

        /**
         * @param i the smoothing radius to store
         */
        public void setSmoothingAverageRadius(int i) {
            this.smoothingAverageRadius = i;
        }
    }

    /**
     * Creates an extractor.
     *
     * @param str           name passed to the K-means model created during extraction
     * @param configuration configuration passed to the temporary dataframe and model
     */
    public CETR(String str, Configuration configuration) {
        this.dbName = str;
        this.conf = configuration;
    }

    /**
     * Extracts the content text of an HTML document.
     *
     * @param str        the HTML source
     * @param parameters the extraction parameters
     * @return the text of all rows classified as content, separated by single
     *         spaces and trimmed; may be empty
     */
    public String extract(String str, Parameters parameters) {
        String cleanedHtml = clearText(str);
        List<String> rows = extractRows(cleanedHtml);
        List<Integer> contentRowIds = selectRows(rows, parameters);

        StringBuilder content = new StringBuilder(cleanedHtml.length());
        for (Integer rowId : contentRowIds) {
            String rowText = StringCleaner.removeExtraSpaces(HTMLParser.extractText(rows.get(rowId)));
            if (!rowText.isEmpty()) {
                content.append(rowText).append(StringUtils.SPACE);
            }
        }
        return content.toString().trim();
    }

    /**
     * Decides which rows are content.
     *
     * @param list       the HTML rows
     * @param parameters the extraction parameters
     * @return the dataframe keys of all records that were not assigned to the
     *         cluster with the lowest mean text-to-tag ratio; {@link #extract}
     *         uses them as indexes into {@code list}
     */
    private List<Integer> selectRows(List<String> list, Parameters parameters) {
        // gaussianSmoothing() builds a NEW list and leaves its argument untouched,
        // so its return value has to be used for the smoothing to take effect.
        List<Double> ratios = gaussianSmoothing(calculateTTRlist(list));

        int window = parameters.getAlphaWindowSizeFor2DModel();
        List<Double> derivatives = null;
        if (window > 0) {
            derivatives = gaussianSmoothing(computeDerivatives(ratios, window));
        }

        Dataframe dataframe = new Dataframe(this.conf);
        try {
            int size = ratios.size();
            for (int row = 0; row < size; row++) {
                AssociativeArray features = new AssociativeArray();
                features.put(0, ratios.get(row));
                if (derivatives != null) {
                    features.put(1, derivatives.get(row));
                }
                dataframe.add(new Record(features, null));
            }

            performClustering(dataframe, parameters.getNumberOfClusters());
            Integer nonContentCluster = findLowestMeanCluster(dataframe);

            List<Integer> selected = new ArrayList<>();
            for (Map.Entry<Integer, Record> entry : dataframe.entries()) {
                if (!Objects.equals(entry.getValue().getYPredicted(), nonContentCluster)) {
                    selected.add(entry.getKey());
                }
            }
            return selected;
        } finally {
            // Release the temporary dataframe even when clustering fails.
            dataframe.delete();
        }
    }

    /**
     * Finds the cluster whose records have the lowest mean value of feature 0.
     *
     * @param dataframe the clustered records (predicted cluster id set on each record)
     * @return the id of the cluster with the smallest mean
     */
    private Integer findLowestMeanCluster(Dataframe dataframe) {
        Map<Object, Double> means = new HashMap<>();
        Map<Object, Integer> counts = new HashMap<>();

        Iterator<Record> records = dataframe.iterator();
        while (records.hasNext()) {
            Record record = records.next();
            Integer clusterId = (Integer) record.getYPredicted();
            double ratio = record.getX().getDouble(0);

            Double sumSoFar = means.get(clusterId);
            Integer countSoFar = counts.get(clusterId);
            means.put(clusterId, (sumSoFar == null ? 0.0d : sumSoFar) + ratio);
            counts.put(clusterId, (countSoFar == null ? 0 : countSoFar) + 1);
        }

        // Turn the per-cluster sums into means.
        for (Map.Entry<Object, Double> entry : means.entrySet()) {
            entry.setValue(entry.getValue() / counts.get(entry.getKey()));
        }
        return (Integer) MapMethods.selectMinKeyValue(means).getKey();
    }

    /**
     * Clusters the records with K-means and stores the predicted cluster on each record.
     *
     * @param dataframe the records to cluster; updated in place by {@code predict}
     * @param i         the number of clusters (K)
     */
    private void performClustering(Dataframe dataframe, int i) {
        Kmeans kmeans = new Kmeans(this.dbName, this.conf);
        try {
            Kmeans.TrainingParameters trainingParameters = new Kmeans.TrainingParameters();
            trainingParameters.setK(i);
            trainingParameters.setMaxIterations(MAX_CLUSTERING_ITERATIONS);
            trainingParameters.setInitializationMethod(Kmeans.TrainingParameters.Initialization.SET_FIRST_K);
            trainingParameters.setDistanceMethod(Kmeans.TrainingParameters.Distance.EUCLIDIAN);
            trainingParameters.setWeighted(false);
            trainingParameters.setCategoricalGamaMultiplier(1.0d);

            kmeans.fit(dataframe, trainingParameters);
            kmeans.predict(dataframe);
        } finally {
            // The model is only needed for this one prediction; always discard it.
            kmeans.delete();
        }
    }

    /**
     * Computes the text-to-tag ratio of every row.
     *
     * @param list the HTML rows
     * @return for each row, the number of content characters divided by the
     *         number of tags (a row without tags is treated as having one)
     */
    private List<Double> calculateTTRlist(List<String> list) {
        List<Double> ratios = new ArrayList<>(list.size());
        for (String row : list) {
            int textLength = countContentChars(row);
            int tagCount = Math.max(countNumberOfTags(row), 1);
            // Divide in floating point; int / int would truncate the ratio.
            ratios.add((double) textLength / tagCount);
        }
        return ratios;
    }

    /**
     * Smooths a series by convolving it with a normalized kernel whose radius is
     * {@code min(ceil(std), (n - 1) / 2)}, where {@code std} is the standard
     * deviation of the series. Window positions that fall outside the series
     * contribute nothing.
     *
     * <p>NOTE(review): as implemented, the kernel weight does not depend on the
     * position inside the kernel, so all weights are equal and this acts as a
     * truncated moving average. Confirm whether a position-dependent Gaussian
     * was intended.
     *
     * @param list the series to smooth; it is not modified
     * @return a new list of the same size holding the smoothed values
     */
    private List<Double> gaussianSmoothing(List<Double> list) {
        int size = list.size();
        double std = Descriptives.std(new FlatDataCollection(list), false);
        double variance = std * std;
        int radius = (int) Math.min(Math.ceil(std), (size - 1.0d) / 2.0d);

        // With radius 0 the kernel is the single weight 1, i.e. the identity.
        // A zero variance would additionally yield NaN weights (0 / 0), so both
        // degenerate cases simply return an unsmoothed copy.
        if (radius <= 0 || variance <= 0.0d) {
            return new ArrayList<>(list);
        }

        // The unnormalized weight is the same for every kernel position, so it
        // is computed once instead of once per position.
        double weight = 0.0d;
        for (int offset = -radius; offset <= radius; offset++) {
            weight += Math.exp(-((double) offset * offset) / (2.0d * variance));
        }

        int kernelSize = 2 * radius + 1;
        double[] kernel = new double[kernelSize];
        double normalizer = 0.0d;
        for (int k = 0; k < kernelSize; k++) {
            kernel[k] = weight;
            normalizer += weight;
        }
        for (int k = 0; k < kernelSize; k++) {
            kernel[k] /= normalizer;
        }

        List<Double> smoothed = new ArrayList<>(size);
        for (int row = 0; row < size; row++) {
            double value = 0.0d;
            for (int offset = -radius; offset <= radius; offset++) {
                int source = row - offset;
                if (source >= 0 && source < size) {
                    value += kernel[offset + radius] * list.get(source);
                }
            }
            smoothed.add(value);
        }
        return smoothed;
    }

    /**
     * Computes, for every position, the absolute difference between the mean of
     * the window starting at that position and the value at that position.
     *
     * @param list the series
     * @param i    the window size; the window covers positions
     *             {@code [p, p + i)} clipped to the end of the series
     * @return a new list of the same size holding the absolute differences
     */
    private List<Double> computeDerivatives(List<Double> list, int i) {
        int size = list.size();
        List<Double> derivatives = new ArrayList<>(size);
        for (int row = 0; row < size; row++) {
            // Number of window positions that actually lie inside the series.
            int span = Math.max(0, Math.min(i, size - row));
            double sum = 0.0d;
            for (int offset = 0; offset < span; offset++) {
                sum += list.get(row + offset);
            }
            // An empty window (i <= 0) is averaged as 0 / 1 to avoid dividing by zero.
            double mean = sum / Math.max(span, 1);
            derivatives.add(Math.abs(mean - list.get(row)));
        }
        return derivatives;
    }

    /**
     * Counts the HTML tags in a row.
     *
     * @param str the row
     * @return the number of non-overlapping matches of {@link #NUMBER_OF_TAGS_PATTERN}
     */
    private int countNumberOfTags(String str) {
        // One matcher for the whole scan: find() resumes after the previous
        // match. Creating a fresh matcher per iteration would find the first
        // tag again and again and never terminate.
        Matcher matcher = NUMBER_OF_TAGS_PATTERN.matcher(str);
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }

    /**
     * Counts the characters of a row's text after tag removal and whitespace cleanup.
     *
     * @param str the row
     * @return the length of the extracted, space-normalized text
     */
    private int countContentChars(String str) {
        String text = HTMLParser.extractText(str);
        return StringCleaner.removeExtraSpaces(text).length();
    }

    /**
     * Splits the cleaned HTML into rows.
     *
     * @param str the cleaned HTML
     * @return the pieces obtained by splitting on {@link StringUtils#LF}
     */
    private List<String> extractRows(String str) {
        String[] rows = str.split(StringUtils.LF);
        return Arrays.asList(rows);
    }

    /**
     * Prepares the HTML for row-based processing.
     *
     * <p>Non-text tags and attributes are removed first. If the result contains
     * at most one line feed, a line feed is inserted after every {@code >} so
     * that there are rows to work with. Finally runs of line breaks are
     * collapsed into one, whitespace-only lines are dropped and the result is
     * trimmed.
     *
     * @param str the raw HTML
     * @return the cleaned HTML
     */
    private String clearText(String str) {
        String html = HTMLParser.removeNonTextTagsAndAttributes(str);
        if (PHPMethods.substr_count(html, '\n') <= 1) {
            html = html.replace(">", ">\n");
        }
        String singleBreaks = html.replaceAll("[\\n\\r]+", StringUtils.LF);
        String withoutBlankLines = singleBreaks.replaceAll("(?m)^[ \t]*\r?\n", "");
        return withoutBlankLines.trim();
    }
}
