package de.jetwick.snacktory;

import com.facebook.internal.AnalyticsEvents;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.a.b;
import org.a.c;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;

/* loaded from: classes.dex */
public class ArticleTextExtractor {
    // Not referenced by name in this file; the literal 35 is the threshold used in isArticle() —
    // presumably this constant inlined by the compiler (TODO confirm).
    private static final int MIN_ARTICLE_SCORE = 35;
    // Compiled forms of negativeStr / positiveStr / unlikelyStr. Despite the constant-style names they
    // are per-instance and are replaced by setNegative(), setPositive() and setUnlikely().
    private Pattern NEGATIVE;
    private Pattern POSITIVE;
    private Pattern UNLIKELY;
    // Formatter used by the extractContent overloads that do not take one explicitly.
    private OutputFormatter formatter = DEFAULT_FORMATTER;
    // Regex sources of the patterns above, kept so addNegative()/addPositive()/addUnlikely()
    // can append further alternatives.
    private String negativeStr;
    private String positiveStr;
    private String unlikelyStr;
    // NOTE(review): org.a.b / org.a.c are obfuscated names — presumably a logging facade; confirm.
    private static final b logger = c.a(ArticleTextExtractor.class);
    // Tag names that getNodes() accepts as article-body candidates (whole-name match).
    private static final Pattern NODES = Pattern.compile("p|div|td|article|section");
    // Tag names whose elements stripUnlikelyCandidates() removes outright.
    private static final Pattern NEGATIVE_NODES = Pattern.compile("aside|form|input|select|textarea|object|h1");
    // Inline-style fragments that cost an element 50 points in calcWeight().
    private static final Pattern NEGATIVE_STYLE = Pattern.compile("hidden|display: ?none|font-size: ?small");
    // Inline-style fragments that make stripUnlikelyCandidates() drop the element's style attribute.
    private static final Pattern NEGATIVE_INLINED_STYLE = Pattern.compile("width ?:|height ?:|padding|margin");
    // Title segments (compared lower-cased and trimmed) that cleanTitle() discards.
    private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() { // from class: de.jetwick.snacktory.ArticleTextExtractor.1
        {
            add("hacker news");
            add("facebook");
        }
    };
    // Formatter instance that every extractor starts out with (see the formatter field).
    private static final OutputFormatter DEFAULT_FORMATTER = new OutputFormatter();

    /* loaded from: classes.dex */
    /**
     * Orders {@link ImageResult}s by their {@code weight} field in descending order,
     * i.e. the result with the larger weight sorts first.
     */
    public class ImageComparator implements Comparator<ImageResult> {
        public ImageComparator() {
        }

        /**
         * Compares the second argument's weight against the first's, which reverses
         * the natural ordering of the weights.
         */
        @Override // java.util.Comparator
        public int compare(ImageResult imageResult, ImageResult imageResult2) {
            return imageResult2.weight.compareTo(imageResult.weight);
        }
    }

    /**
     * Creates an extractor with the built-in "unlikely", "positive" and "negative"
     * class/id patterns. They can be replaced or extended afterwards through the
     * set*() and add*() methods.
     */
    public ArticleTextExtractor() {
        // NOTE(review): "sponsora(d|ll|gegate|rchive|ttachment)" looks like a missing "|" between
        // "sponsor" and "a(...)" — confirm the intended expression before changing it.
        setUnlikely("action|com(bx|ment|munity)|dis(qus|cuss)|e(xtra|[-]?mail)|foot|h(eader|idden)|menu|re(mark|ply)|rss|sh(are|outbox)|sponsora(d|ll|gegate|rchive|ttachment)|(pag(er|ination))|popup|print|l(inks|ogin)|si(debar|gn|ngle)|subscribe|s(lider|hare([-]?)|ocial)");
        setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))|arti(cle|kel)|instapaper_body");
        setNegative("author|nav($|igation)|load($|ing)|user|com(ment|bx)|(^com-)|co(unter|ntact|unt$)|foot|masthead|meta|outbrain|promo|related|(sho(utbox|pping))|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|overlay");
    }

    /**
     * Scores an element purely from its attributes.
     *
     * <p>POSITIVE hits add 35 (class), 40 (id) and 40 (itemprop); UNLIKELY hits on class or id
     * subtract 30 each; NEGATIVE hits on class or id subtract 50 each; an inline style matching
     * NEGATIVE_STYLE subtracts another 50.
     *
     * @param element element to score
     * @return the summed attribute weight (may be negative)
     */
    private int calcWeight(Element element) {
        final String className = element.className();
        final String id = element.id();

        int weight = 0;
        if (this.POSITIVE.matcher(className).find()) {
            weight += 35;
        }
        if (this.POSITIVE.matcher(id).find()) {
            weight += 40;
        }
        if (this.POSITIVE.matcher(element.attr("itemprop")).find()) {
            weight += 40;
        }

        if (this.UNLIKELY.matcher(className).find()) {
            weight -= 30;
        }
        if (this.UNLIKELY.matcher(id).find()) {
            weight -= 30;
        }

        if (this.NEGATIVE.matcher(className).find()) {
            weight -= 50;
        }
        if (this.NEGATIVE.matcher(id).find()) {
            weight -= 50;
        }

        // NOTE(review): the attribute name comes from a Facebook SDK constant — presumably the
        // literal "style" substituted by the decompiler; confirm.
        final String style = element.attr(AnalyticsEvents.PARAMETER_LIKE_VIEW_STYLE);
        if (style != null && !style.isEmpty() && NEGATIVE_STYLE.matcher(style).find()) {
            weight -= 50;
        }
        return weight;
    }

    /**
     * Scores a child element from its own text and adds that score to the child's stored score.
     *
     * <p>If the combined SHelper.count results for "&amp;quot;", "&amp;lt;", "&amp;gt;" and "px"
     * exceed 5, the score is -30; otherwise it is the text length divided by 25, rounded.
     *
     * @param element child whose stored score is adjusted via addScore
     * @param str     the child's own text
     * @return the score that was added
     */
    private int calcWeightForChild(Element element, String str) {
        final boolean looksLikeMarkup =
                SHelper.count(str, "&quot;") + SHelper.count(str, "&lt;") + SHelper.count(str, "&gt;") + SHelper.count(str, "px") > 5;

        final int score;
        if (looksLikeMarkup) {
            score = -30;
        } else {
            score = (int) Math.round(str.length() / 25.0d);
        }
        addScore(element, score);
        return score;
    }

    /**
     * Splits a title on the given regular expression and returns its longest piece.
     *
     * <p>When several pieces share the greatest length the first one wins. Occurrences of
     * "&amp;raquo;" and "»" in the chosen piece are replaced by a space before trimming.
     *
     * @param str  title to split
     * @param str2 regular expression passed to {@link String#split(String)}
     * @return the longest piece, cleaned and trimmed; empty if every piece is empty
     */
    private String doTitleSplits(String str, String str2) {
        String longest = "";
        int longestLength = 0;
        for (String piece : str.split(str2)) {
            if (piece.length() > longestLength) {
                longestLength = piece.length();
                longest = piece;
            }
        }
        return longest.replace("&raquo;", " ").replace("»", " ").trim();
    }

    /**
     * Collects the {@code src} values of every {@code img[src]} selected from the given element.
     *
     * @param element element to search
     * @return a new set of the src attribute values found
     */
    private Set<String> getImageSet(Element element) {
        Set<String> sources = new HashSet<String>();
        for (Element image : element.select("img[src]")) {
            sources.add(image.attr("src"));
        }
        return sources;
    }

    /**
     * Treats an image URL as an advertisement when SHelper.count reports at least two
     * occurrences of "ad" in it.
     *
     * @param str image URL
     * @return true when the count is 2 or more
     */
    private boolean isAdImage(String str) {
        return SHelper.count(str, "ad") >= 2;
    }

    /** Logs the element with the given prefix and an empty suffix. */
    private void print(String str, Element element) {
        print(str, element, "");
    }

    /**
     * Logs one line describing the element: prefix, node name, id, class, text and suffix.
     *
     * @param str     prefix written before the element description
     * @param element element to describe
     * @param str2    suffix written after the element text
     */
    private void print(String str, Element element, String str2) {
        StringBuilder line = new StringBuilder();
        line.append(str).append(" ").append(element.nodeName());
        line.append(" id=").append(element.id());
        line.append(" class=").append(element.className());
        line.append(" text=").append(element.text());
        line.append(" ").append(str2);
        logger.b(line.toString());
    }

    /** Logs the element with empty prefix and suffix. */
    private void print(Element element) {
        print("", element, "");
    }

    /**
     * Logs a short line for the element: prefix, node name, id and class (no text).
     *
     * @param str     prefix written before the element description
     * @param element element to describe
     */
    private void printId(String str, Element element) {
        StringBuilder line = new StringBuilder();
        line.append(str).append(" ").append(element.nodeName());
        line.append(" id=").append(element.id());
        line.append(" class=").append(element.className());
        logger.b(line.toString());
    }

    /** Logs the element's node name, id and class with an empty prefix. */
    private void printId(Element element) {
        printId("", element);
    }

    /**
     * Removes every element with the tag "script", "noscript" or the tag named by
     * AnalyticsEvents.PARAMETER_LIKE_VIEW_STYLE from the document, in that order.
     *
     * @param document document to clean in place
     * @return the same document instance
     */
    private Document removeScriptsAndStyles(Document document) {
        // NOTE(review): the third tag name is a Facebook SDK constant — presumably the literal
        // "style" substituted by the decompiler; confirm.
        final String[] tagNames = {"script", "noscript", AnalyticsEvents.PARAMETER_LIKE_VIEW_STYLE};
        for (String tagName : tagNames) {
            for (Element unwanted : document.getElementsByTag(tagName)) {
                unwanted.remove();
            }
        }
        return document;
    }

    /**
     * Appends another alternative to the current negative expression and recompiles it.
     *
     * @param str regular-expression fragment joined to the existing source with "|"
     * @return this extractor
     */
    public ArticleTextExtractor addNegative(String str) {
        setNegative(this.negativeStr + "|" + str);
        return this;
    }

    /**
     * Appends another alternative to the current positive expression and recompiles it.
     *
     * @param str regular-expression fragment joined to the existing source with "|"
     * @return whatever setPositive returns (this extractor in this class)
     */
    public ArticleTextExtractor addPositive(String str) {
        return setPositive(this.positiveStr + "|" + str);
    }

    /**
     * Adds the given amount to the score stored on the element.
     *
     * @param element element whose stored score is updated
     * @param i       amount to add (may be negative)
     */
    public void addScore(Element element, int i) {
        setScore(element, getScore(element) + i);
    }

    /**
     * Appends another alternative to the current unlikely expression and recompiles it.
     *
     * @param str regular-expression fragment joined to the existing source with "|"
     * @return whatever setUnlikely returns (this extractor in this class)
     */
    public ArticleTextExtractor addUnlikely(String str) {
        return setUnlikely(this.unlikelyStr + "|" + str);
    }

    /**
     * Rebuilds a "|"-separated title without the segments that carry no information.
     *
     * <p>A segment is dropped when its lower-cased, trimmed form is listed in
     * IGNORED_TITLE_PARTS. A segment is also dropped when the number of segments kept so far
     * equals {@code parts.length - 1} and the text accumulated so far is longer than the
     * segment. Kept segments are re-joined with "|" and passed through SHelper.innerTrim.
     *
     * @param str raw title
     * @return the cleaned title
     */
    public String cleanTitle(String str) {
        // The split result needs a name: its length is consulted inside the loop.
        final String[] parts = str.split("\\|");
        final StringBuilder cleaned = new StringBuilder();
        int kept = 0;
        for (String part : parts) {
            if (IGNORED_TITLE_PARTS.contains(part.toLowerCase().trim())) {
                continue;
            }
            if (kept == parts.length - 1 && cleaned.length() > part.length()) {
                continue;
            }
            if (kept > 0) {
                cleaned.append("|");
            }
            cleaned.append(part);
            kept++;
        }
        return SHelper.innerTrim(cleaned.toString());
    }

    /**
     * Picks the most promising image inside the element and records every candidate.
     *
     * <p>Images are selected from the element itself or, when it has none and has a parent,
     * from its parent. Images with an empty src, or whose src isAdImage accepts, are skipped.
     * Each remaining image is weighted: +20/-20 for a parsable height and again for a parsable
     * width depending on whether it is at least 50, +20 each for an alt or title longer than
     * 35 characters, and -40 when the parent's rel attribute contains "nofollow". The weight is
     * multiplied by a factor that starts at 1.0 and is halved each time a new best image is found.
     *
     * @param element element to search
     * @param list    receives one ImageResult per candidate; sorted with ImageComparator before returning
     * @return the image with the highest positive weight, or null if none scored above 0
     */
    public Element determineImageSource(Element element, List<ImageResult> list) {
        Elements images = element.select("img");
        if (images.isEmpty() && element.parent() != null) {
            images = element.parent().select("img");
        }

        Element bestImage = null;
        int bestWeight = 0;
        double factor = 1.0d;
        for (Element image : images) {
            final String src = image.attr("src");
            if (src.isEmpty() || isAdImage(src)) {
                continue;
            }

            int weight = 0;
            int height = 0;
            try {
                height = Integer.parseInt(image.attr("height"));
                weight = height >= 50 ? 20 : -20;
            } catch (Exception ignored) {
                // Missing or non-numeric height: leave height and weight at 0.
            }
            int width = 0;
            try {
                width = Integer.parseInt(image.attr("width"));
                weight = width >= 50 ? weight + 20 : weight - 20;
            } catch (Exception ignored) {
                // Missing or non-numeric width: leave width at 0 and the weight unchanged.
            }

            final String alt = image.attr("alt");
            if (alt.length() > 35) {
                weight += 20;
            }
            final String title = image.attr("title");
            if (title.length() > 35) {
                weight += 20;
            }

            boolean noFollow = false;
            if (image.parent() != null) {
                final String rel = image.parent().attr("rel");
                if (rel != null && rel.contains("nofollow")) {
                    noFollow = true;
                    weight -= 40;
                }
            }

            // Scale with the factor in effect before this image possibly becomes the new best.
            final int scaledWeight = (int) (weight * factor);
            if (scaledWeight > bestWeight) {
                factor /= 2.0d;
                bestImage = image;
                bestWeight = scaledWeight;
            }
            list.add(new ImageResult(src, Integer.valueOf(scaledWeight), title, height, width, alt, noFollow));
        }

        Collections.sort(list, new ImageComparator());
        return bestImage;
    }

    /**
     * Looks up the author name, trying in order:
     * <ol>
     *   <li>the first {@code a[rel=author]} element: its text, then its title attribute;</li>
     *   <li>the content of several head meta tags (itemprop/name author, publisher,
     *       twitter:creator, article:author);</li>
     *   <li>the text of the first element carrying {@code itemprop=author}.</li>
     * </ol>
     * A candidate is accepted only when it is non-empty and SHelper.isSanitized accepts it.
     *
     * @param document parsed page
     * @return the author, or null when no candidate qualifies
     */
    protected String extractAuthor(Document document) {
        Element relAuthor = document.select("a[rel=author]").first();
        if (relAuthor != null) {
            String linkText = relAuthor.text();
            if (linkText.length() > 0 && SHelper.isSanitized(linkText)) {
                return linkText;
            }
            String linkTitle = relAuthor.attr("title");
            if (linkTitle.length() > 0 && SHelper.isSanitized(linkTitle)) {
                return linkTitle;
            }
            // The link text is not tested a second time here: it was already rejected above.
        }

        final String[] metaSelectors = {
            "head meta[itemprop=author]",
            "head meta[name=author]",
            "head meta[name=publisher]",
            "head meta[name=twitter:creator]",
            "head meta[property=article:author]"
        };
        for (String selector : metaSelectors) {
            String content = SHelper.innerTrim(document.select(selector).attr("content"));
            if (!content.isEmpty() && SHelper.isSanitized(content)) {
                return content;
            }
        }

        Element itempropAuthor = document.select("[itemprop=author]").first();
        if (itempropAuthor != null) {
            String itempropText = itempropAuthor.text();
            if (itempropText.length() > 0 && SHelper.isSanitized(itempropText)) {
                return itempropText;
            }
        }
        return null;
    }

    /**
     * Reads the canonical URL from {@code link[rel=canonical]}, falling back to the
     * {@code og:url} and then the {@code twitter:url} meta tag.
     *
     * @param document parsed page
     * @return the first non-empty value, or the (possibly empty) twitter:url value
     */
    protected String extractCanonicalUrl(Document document) {
        String url = SHelper.replaceSpaces(document.select("head link[rel=canonical]").attr("href"));
        if (url.isEmpty()) {
            url = SHelper.replaceSpaces(document.select("head meta[property=og:url]").attr("content"));
            if (url.isEmpty()) {
                url = SHelper.replaceSpaces(document.select("head meta[name=twitter:url]").attr("content"));
            }
        }
        return url;
    }

    /**
     * Extracts article text from an HTML string using this extractor's current formatter.
     *
     * @param jResult result object to fill
     * @param str     HTML source
     * @return the filled result
     */
    public JResult extractContent(JResult jResult, String str) {
        return extractContent(jResult, str, this.formatter);
    }

    /**
     * Parses the HTML string with Jsoup and extracts article text from it.
     *
     * @param jResult         result object to fill
     * @param str             HTML source; must not be empty
     * @param outputFormatter formatter used to render the chosen element
     * @return the filled result
     * @throws IllegalArgumentException if {@code str} is empty
     */
    public JResult extractContent(JResult jResult, String str, OutputFormatter outputFormatter) {
        if (str.isEmpty()) {
            throw new IllegalArgumentException("html string is empty!?");
        }
        return extractContent(jResult, Jsoup.parse(str), outputFormatter);
    }

    /**
     * Extracts article text (not HTML) from a parsed document.
     *
     * @param jResult         result object to fill
     * @param document        parsed page
     * @param outputFormatter formatter used to render the chosen element
     * @return the filled result
     */
    public JResult extractContent(JResult jResult, Document document, OutputFormatter outputFormatter) {
        return extractContent(jResult, document, outputFormatter, false);
    }

    /**
     * Fills the result with the page metadata and the content of the best-scoring element.
     *
     * <p>Title, author, date, description, image URL and canonical URL are read first; then the
     * document is prepared and every node from getNodes is weighed with getWeight. The node with
     * the highest weight above 0 wins, and the search stops early at the first new best whose
     * weight exceeds 200. The winner is cleaned with stripUnlikelyCandidates and rendered either
     * as HTML or as text. RSS, video, favicon and keywords are read last.
     *
     * @param jResult         result object to fill
     * @param document        parsed page; modified in place
     * @param outputFormatter formatter used to render the chosen element
     * @param z               true to store the element's formatted HTML, false to store its formatted text
     * @return the same result object
     * @throws NullPointerException if {@code document} is null
     */
    public JResult extractContent(JResult jResult, Document document, OutputFormatter outputFormatter, boolean z) {
        if (document == null) {
            throw new NullPointerException("missing document");
        }

        jResult.setTitle(extractTitle(document));
        jResult.setAuthor(extractAuthor(document));
        jResult.setDate(extractDate(document));
        jResult.setDescription(extractDescription(document));
        jResult.setImageUrl(extractImageUrl(document));
        jResult.setCanonicalUrl(extractCanonicalUrl(document));

        prepareDocument(document);

        Element bestElement = null;
        int bestWeight = 0;
        for (Element candidate : getNodes(document)) {
            int weight = getWeight(candidate);
            if (weight > bestWeight) {
                bestElement = candidate;
                bestWeight = weight;
                if (weight > 200) {
                    // Good enough: stop looking at further candidates.
                    break;
                }
            }
        }

        if (bestElement == null) {
            jResult.setIsArticle(false);
        } else {
            stripUnlikelyCandidates(bestElement);

            if (jResult.getImageUrl().isEmpty()) {
                ArrayList<ImageResult> images = new ArrayList<ImageResult>();
                Element bestImage = determineImageSource(bestElement, images);
                if (bestImage != null) {
                    jResult.setImageUrl(SHelper.replaceSpaces(bestImage.attr("src")));
                    jResult.setImages(images);
                }
            }

            String content;
            if (z) {
                content = Jsoup.parse(outputFormatter.getFormattedHtml(bestElement)).body().html();
            } else {
                content = removeTitleFromText(outputFormatter.getFormattedText(bestElement), jResult.getTitle());
            }
            if (content.length() > jResult.getTitle().length()) {
                jResult.setText(content);
            }

            if (jResult.getImageUrl() != null && !jResult.getImageUrl().isEmpty()) {
                // The legend image is only set when the image is not already part of the content.
                boolean alreadyInContent = getImageSet(bestElement).contains(jResult.getImageUrl());
                if (alreadyInContent) {
                    jResult.setLegendImageUrl(null);
                } else {
                    jResult.setLegendImageUrl(jResult.getImageUrl());
                }
            }
            jResult.setIsArticle(isArticle(document, bestElement));
        }

        if (jResult.getImageUrl().isEmpty()) {
            jResult.setImageUrl(extractImageUrl(document));
        }
        jResult.setRssUrl(extractRssUrl(document));
        jResult.setVideoUrl(extractVideoUrl(document));
        jResult.setFaviconUrl(extractFaviconUrl(document));
        jResult.setKeywords(extractKeywords(document));
        return jResult;
    }

    /**
     * Extracts article text from an HTML string into a new JResult.
     *
     * @param str HTML source
     * @return the filled result
     */
    public JResult extractContent(String str) {
        return extractContent(new JResult(), str);
    }

    /**
     * Extracts article text from a parsed document into a new JResult, using this
     * extractor's current formatter.
     *
     * @param document parsed page
     * @return the filled result
     */
    public JResult extractContent(Document document) {
        return extractContent(new JResult(), document, this.formatter);
    }

    /**
     * Extracts article text from a parsed document into a new JResult.
     *
     * @param document        parsed page
     * @param outputFormatter formatter used to render the chosen element
     * @return the filled result
     */
    public JResult extractContent(Document document, OutputFormatter outputFormatter) {
        return extractContent(new JResult(), document, outputFormatter);
    }

    /**
     * Extracts the article as HTML from an HTML string using this extractor's current formatter.
     *
     * @param jResult result object to fill
     * @param str     HTML source
     * @return the filled result
     */
    public JResult extractContentHtml(JResult jResult, String str) {
        return extractContentHtml(jResult, str, this.formatter);
    }

    /**
     * Parses the HTML string with Jsoup and extracts the article as HTML.
     *
     * @param jResult         result object to fill
     * @param str             HTML source; must not be empty
     * @param outputFormatter formatter used to render the chosen element
     * @return the filled result
     * @throws IllegalArgumentException if {@code str} is empty
     */
    public JResult extractContentHtml(JResult jResult, String str, OutputFormatter outputFormatter) {
        if (str.isEmpty()) {
            throw new IllegalArgumentException("html string is empty!?");
        }
        return extractContentHtml(jResult, Jsoup.parse(str), outputFormatter);
    }

    /**
     * Extracts the article as HTML from a parsed document (the HTML flag of the
     * four-argument extractContent is set to true).
     *
     * @param jResult         result object to fill
     * @param document        parsed page
     * @param outputFormatter formatter used to render the chosen element
     * @return the filled result
     */
    public JResult extractContentHtml(JResult jResult, Document document, OutputFormatter outputFormatter) {
        return extractContent(jResult, document, outputFormatter, true);
    }

    /**
     * Looks for a publication date in a fixed list of selectors.
     *
     * <p>For each selector the {@code content} attribute is tried first and the element text
     * second; the first non-empty value is handed to SHelper.parseDate. When no selector yields
     * a value, the {@code datetime} attribute of {@code time[datetime]} is parsed instead.
     *
     * @param document parsed page
     * @return the result of SHelper.parseDate for the value found
     */
    protected String extractDate(Document document) {
        final String[] selectors = {
            "[property=article:published_time]",
            "[property=article:modified_time]",
            "[property=article:published]",
            "meta[name=publish_date]",
            "meta[name=displaydate]",
            "[itemprop=datePublished]"
        };
        for (String selector : selectors) {
            Elements matches = document.select(selector);
            String raw = SHelper.innerTrim(matches.attr("content"));
            if (raw.isEmpty()) {
                raw = SHelper.innerTrim(matches.text());
            }
            if (!raw.isEmpty()) {
                return SHelper.parseDate(raw);
            }
        }
        // Reaching this point means every selector above produced an empty value.
        return SHelper.parseDate(SHelper.innerTrim(document.select("time[datetime]").attr("datetime")));
    }

    /**
     * Reads the page description from the first of these meta tags that has non-empty content:
     * {@code name=description}, {@code property=og:description}, {@code name=twitter:description}.
     *
     * @param document parsed page
     * @return the description after SHelper.replaceSpaces, or an empty string if none is present
     */
    protected String extractDescription(Document document) {
        // Each attribute selector is closed with "]"; an unterminated "[...": is not valid selector syntax.
        final String[] selectors = {
            "head meta[name=description]",
            "head meta[property=og:description]",
            "head meta[name=twitter:description]"
        };
        String description = "";
        for (String selector : selectors) {
            description = SHelper.replaceSpaces(document.select(selector).attr("content"));
            if (!description.isEmpty()) {
                break;
            }
        }
        return description;
    }

    /**
     * Extracts the favicon URL from an HTML string, using an empty base URI.
     *
     * @param str HTML source
     * @return the favicon URL, or an empty string if none is found
     */
    protected String extractFaviconUrl(String str) {
        return extractFaviconUrl(str, "");
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Parses the HTML string, sets the given base URI on the document and extracts the favicon URL.
     *
     * @param str  HTML source; must not be empty
     * @param str2 base URI set on the parsed document (used when the href is made absolute)
     * @return the favicon URL, or an empty string if none is found
     * @throws IllegalArgumentException if {@code str} is empty
     */
    public String extractFaviconUrl(String str, String str2) {
        if (str.isEmpty()) {
            throw new IllegalArgumentException("html string is empty");
        }
        Document parse = Jsoup.parse(str);
        parse.setBaseUri(str2);
        return extractFaviconUrl(parse);
    }

    /**
     * Returns the absolute href of the first {@code link} whose rel is "shortcut", "icon" or
     * "shortcut icon" (case-insensitive).
     *
     * @param document parsed page
     * @return the favicon URL after SHelper.replaceSpaces, or an empty string if no such link exists
     */
    protected String extractFaviconUrl(Document document) {
        Elements iconLinks = document.select("link[rel~=(?i)^(shortcut|icon|shortcut icon)$]");
        if (iconLinks.isEmpty()) {
            return "";
        }
        return SHelper.replaceSpaces(iconLinks.first().absUrl("href"));
    }

    /**
     * Reads the lead image URL from the first of these meta tags that has non-empty content:
     * og:image, twitter:image, twitter:image:src, thumbnail. Falls back to the href of
     * {@code link[rel=image_src]}.
     *
     * @param document parsed page
     * @return the image URL, possibly empty
     */
    protected String extractImageUrl(Document document) {
        final String[] metaSelectors = {
            "head meta[property=og:image]",
            "head meta[name=twitter:image]",
            "head meta[name=twitter:image:src]",
            "head meta[name=thumbnail]"
        };
        for (String selector : metaSelectors) {
            String url = SHelper.replaceSpaces(document.select(selector).attr("content"));
            if (!url.isEmpty()) {
                return url;
            }
        }
        return SHelper.innerTrim(document.select("link[rel=image_src]").attr("href"));
    }

    /**
     * Reads the {@code keywords} meta tag and splits it on commas (surrounding whitespace removed).
     *
     * <p>A value wrapped in square brackets has the brackets stripped first.
     *
     * @param document parsed page
     * @return the keywords as a fixed-size list, or an empty list when there are none
     */
    protected Collection<String> extractKeywords(Document document) {
        String content = SHelper.innerTrim(document.select("head meta[name=keywords]").attr("content"));
        if (content == null) {
            return Collections.emptyList();
        }

        if (content.startsWith("[") && content.endsWith("]")) {
            content = content.substring(1, content.length() - 1);
        }

        String[] keywords = content.split("\\s*,\\s*");
        boolean hasKeyword = keywords.length > 1 || (keywords.length > 0 && !"".equals(keywords[0]));
        if (hasKeyword) {
            return Arrays.asList(keywords);
        }
        return Collections.emptyList();
    }

    /**
     * Reads the href of the {@code link[rel=alternate]} element(s) that also have
     * {@code type=application/rss+xml}.
     *
     * @param document parsed page
     * @return the feed URL after SHelper.replaceSpaces
     */
    protected String extractRssUrl(Document document) {
        return SHelper.replaceSpaces(document.select("link[rel=alternate]").select("link[type=application/rss+xml]").attr("href"));
    }

    /**
     * Reads the page title from the first of these meta tags that has non-empty content:
     * og:title, twitter:title, title. Falls back to the text of the {@code title} element(s).
     *
     * @param document parsed page
     * @return the title after SHelper.innerTrim
     */
    protected String extractTitle(Document document) {
        // No initial read of document.title(): its value was always overwritten by the first
        // selector below, so it never reached the caller.
        final String[] metaSelectors = {
            "head meta[property=og:title]",
            "head meta[name=twitter:title]",
            "head meta[name=title]"
        };
        for (String selector : metaSelectors) {
            String title = SHelper.innerTrim(document.select(selector).attr("content"));
            if (!title.isEmpty()) {
                return title;
            }
        }
        return SHelper.innerTrim(document.select("title").text());
    }

    /**
     * Reads the content of the {@code og:video} meta tag.
     *
     * @param document parsed page
     * @return the video URL after SHelper.replaceSpaces
     */
    protected String extractVideoUrl(Document document) {
        return SHelper.replaceSpaces(document.select("head meta[property=og:video]").attr("content"));
    }

    /**
     * Collects the candidate elements below {@code body} whose tag name matches NODES, in
     * document order, and seeds each with a starting score.
     *
     * <p>The first candidate receives 100 and the value is halved (integer division) for every
     * following candidate.
     *
     * @param document parsed page
     * @return the candidates in encounter order
     */
    public Collection<Element> getNodes(Document document) {
        LinkedHashMap<Element, Object> candidates = new LinkedHashMap<Element, Object>(64);
        int startScore = 100;
        for (Element node : document.select("body").select("*")) {
            if (!NODES.matcher(node.tagName()).matches()) {
                continue;
            }
            candidates.put(node, null);
            setScore(node, startScore);
            startScore /= 2;
        }
        return candidates.keySet();
    }

    /**
     * Reads the score stored in the element's {@code gravityScore} attribute.
     *
     * @param element element to inspect
     * @return the stored score, or 0 when reading or parsing the attribute fails
     */
    public int getScore(Element element) {
        int score = 0;
        try {
            score = Integer.parseInt(element.attr("gravityScore"));
        } catch (Exception ignored) {
            // No usable score stored: fall through with 0.
        }
        return score;
    }

    /**
     * Computes the total weight of a candidate: its attribute weight, one tenth of its own text
     * length (rounded), and the weight contributed by its children.
     *
     * @param element candidate element
     * @return the combined weight
     */
    protected int getWeight(Element element) {
        final int attributeWeight = calcWeight(element);
        final int ownTextWeight = (int) Math.round((element.ownText().length() / 100.0d) * 10.0d);
        final int childWeight = weightChildNodes(element);
        return attributeWeight + ownTextWeight + childWeight;
    }

    /**
     * Decides whether the page looks like an article by accumulating points until 35 is reached.
     *
     * <p>Points: 15 for exactly one element whose itemtype contains "article", 10 for exactly one
     * {@code article} tag, the chosen element's text length divided by 35, 10 for any meta
     * property starting with "article", 10 for {@code og:type=article}, 5 for a twitter card
     * whose content starts with "summary". Each step after the first two is only evaluated
     * while the total is still below 35.
     *
     * @param document parsed page
     * @param element  element chosen as the article body
     * @return true when the total reaches at least 35
     */
    protected boolean isArticle(Document document, Element element) {
        final int threshold = 35;

        int score = 0;
        if (document.getElementsByAttributeValueContaining("itemtype", "article").size() == 1) {
            score += 15;
        }
        if (document.getElementsByTag("article").size() == 1) {
            score += 10;
        }
        if (score < threshold) {
            score += element.text().length() / 35;
        }
        if (score < threshold && !document.select("meta[property^=article]").isEmpty()) {
            score += 10;
        }
        if (score < threshold && !document.select("meta[property=og:type][content=article]").isEmpty()) {
            score += 10;
        }
        if (score < threshold && !document.select("meta[name=twitter:card][content^=summary]").isEmpty()) {
            score += 5;
        }
        return score >= threshold;
    }

    /**
     * Prepares the document for scoring; this implementation only calls removeScriptsAndStyles.
     *
     * @param document document to clean in place
     */
    protected void prepareDocument(Document document) {
        removeScriptsAndStyles(document);
    }

    /**
     * Returns the text unchanged; the title argument is not used by this implementation.
     *
     * @param str  extracted text
     * @param str2 page title (ignored)
     * @return {@code str}
     */
    public String removeTitleFromText(String str, String str2) {
        return str;
    }

    /**
     * Replaces the negative expression.
     *
     * @param str regular expression source
     * @return this extractor
     * @throws java.util.regex.PatternSyntaxException if {@code str} is not a valid expression;
     *         the previous expression then stays in effect
     */
    public ArticleTextExtractor setNegative(String str) {
        // Compile before assigning so a rejected expression cannot leave the stored source
        // and the compiled pattern out of sync.
        Pattern compiled = Pattern.compile(str);
        this.negativeStr = str;
        this.NEGATIVE = compiled;
        return this;
    }

    /**
     * Sets the formatter used by the extractContent overloads that do not take one explicitly.
     *
     * @param outputFormatter formatter to use from now on
     */
    public void setOutputFormatter(OutputFormatter outputFormatter) {
        this.formatter = outputFormatter;
    }

    /**
     * Replaces the positive expression.
     *
     * @param str regular expression source
     * @return this extractor
     * @throws java.util.regex.PatternSyntaxException if {@code str} is not a valid expression;
     *         the previous expression then stays in effect
     */
    public ArticleTextExtractor setPositive(String str) {
        // Compile before assigning so a rejected expression cannot leave the stored source
        // and the compiled pattern out of sync.
        Pattern compiled = Pattern.compile(str);
        this.positiveStr = str;
        this.POSITIVE = compiled;
        return this;
    }

    /**
     * Stores the score on the element as the string value of its {@code gravityScore} attribute.
     *
     * @param element element to annotate
     * @param i       score to store
     */
    public void setScore(Element element, int i) {
        element.attr("gravityScore", Integer.toString(i));
    }

    /**
     * Replaces the unlikely expression.
     *
     * @param str regular expression source
     * @return this extractor
     * @throws java.util.regex.PatternSyntaxException if {@code str} is not a valid expression;
     *         the previous expression then stays in effect
     */
    public ArticleTextExtractor setUnlikely(String str) {
        // Compile before assigning so a rejected expression cannot leave the stored source
        // and the compiled pattern out of sync.
        Pattern compiled = Pattern.compile(str);
        this.unlikelyStr = str;
        this.UNLIKELY = compiled;
        return this;
    }

    /**
     * Removes elements that are unlikely to belong to the article and strips layout attributes
     * from the ones that stay.
     *
     * <p>For every element selected by "*" from the given element:
     * <ul>
     *   <li>it is removed when its lower-cased class or id matches NEGATIVE, or its tag name
     *       matches NEGATIVE_NODES;</li>
     *   <li>otherwise it is removed when its stored score is not positive and its class or id
     *       matches UNLIKELY;</li>
     *   <li>otherwise it is removed when its only child is a blank text node, an inline style
     *       matching NEGATIVE_INLINED_STYLE is dropped, and width/height attributes are dropped.</li>
     * </ul>
     *
     * @param element root of the subtree to clean in place
     */
    protected void stripUnlikelyCandidates(Element element) {
        for (Element child : element.select("*")) {
            final String className = child.className().toLowerCase();
            final String tagName = child.tagName().toLowerCase();
            final String id = child.id().toLowerCase();

            if (this.NEGATIVE.matcher(className).find()
                    || this.NEGATIVE.matcher(id).find()
                    || NEGATIVE_NODES.matcher(tagName).find()) {
                child.remove();
                continue;
            }

            // getScore() yields 0 for a missing or non-numeric gravityScore attribute, so a
            // malformed value is treated as "no score" instead of raising NumberFormatException.
            if (getScore(child) <= 0
                    && (this.UNLIKELY.matcher(className).find() || this.UNLIKELY.matcher(id).find())) {
                child.remove();
                continue;
            }

            if (child.childNodeSize() == 1) {
                Node onlyChild = child.childNode(0);
                if ((onlyChild instanceof TextNode) && ((TextNode) onlyChild).isBlank()) {
                    child.remove();
                }
            }

            // NOTE(review): the attribute name is a Facebook SDK constant — presumably the literal
            // "style" substituted by the decompiler; confirm.
            final String style = child.attr(AnalyticsEvents.PARAMETER_LIKE_VIEW_STYLE);
            if (!style.isEmpty() && NEGATIVE_INLINED_STYLE.matcher(style).find()) {
                child.removeAttr(AnalyticsEvents.PARAMETER_LIKE_VIEW_STYLE);
            }
            if (child.hasAttr("width")) {
                child.removeAttr("width");
            }
            if (child.hasAttr("height")) {
                child.removeAttr("height");
            }
        }
    }

    /**
     * Computes the weight contributed by the direct children of a candidate element.
     *
     * <p>Children whose own text is shorter than 20 characters are ignored. For the others:
     * text longer than 200 characters adds max(50, length / 10); an h1 or h2 adds 30; a div or p
     * adds the result of calcWeightForChild. A p with more than 50 characters counts as a
     * paragraph, and a div or p whose class is "caption" (case-insensitive) marks a caption.
     * A caption adds 30 once. When at least two paragraphs were found, every h1-h6 child adds 20,
     * every table/li/td/th child has 30 subtracted from its stored score, and every p child has
     * 30 added to its stored score.
     *
     * @param element candidate element whose children are examined
     * @return the weight gained from the children
     */
    protected int weightChildNodes(Element element) {
        int weight = 0;
        Element caption = null;
        List<Element> paragraphs = new ArrayList<Element>(5);

        for (Element child : element.children()) {
            final String ownText = child.ownText();
            final int ownTextLength = ownText.length();
            if (ownTextLength < 20) {
                continue;
            }
            if (ownTextLength > 200) {
                weight += Math.max(50, ownTextLength / 10);
            }

            final String tagName = child.tagName();
            if (tagName.equals("h1") || tagName.equals("h2")) {
                weight += 30;
            } else if (tagName.equals("div") || tagName.equals("p")) {
                weight += calcWeightForChild(child, ownText);
                if (tagName.equals("p") && ownTextLength > 50) {
                    paragraphs.add(child);
                }
                if (child.className().toLowerCase().equals("caption")) {
                    // Remember the caption so the bonus below can actually be granted.
                    caption = child;
                }
            }
        }

        if (caption != null) {
            weight += 30;
        }

        if (paragraphs.size() >= 2) {
            // Exact tag-name membership: a substring test against "table;li;td;th" would also
            // accept tags such as "a", "b" and "i".
            final List<String> headingTags = Arrays.asList("h1", "h2", "h3", "h4", "h5", "h6");
            final List<String> penalizedTags = Arrays.asList("table", "li", "td", "th");
            for (Element child : element.children()) {
                final String tagName = child.tagName();
                if (headingTags.contains(tagName)) {
                    weight += 20;
                } else if (penalizedTags.contains(tagName)) {
                    addScore(child, -30);
                }
                if ("p".equals(tagName)) {
                    addScore(child, 30);
                }
            }
        }
        return weight;
    }
}
