package arabi.tools.arwiki.text;

import java.io.IOException;
import java.net.URLEncoder;
import java.text.ParseException;
import java.util.ArrayList;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;

/* loaded from: input_file:arabi/tools/arwiki/text/ArticleProcessing.class */
/**
 * Fetches one Arabic Wikipedia article (identified by its title) and extracts
 * pieces of it: inter-language titles, category tags, the intro snippet, the
 * lead image, the plain text, and a JSON report built from the infobox.
 *
 * <p>Every public method performs network requests against
 * {@code ar.wikipedia.org}. The article name must be non-null when any of
 * them is called.
 */
public class ArticleProcessing {
    /** Language code of the wiki every public method talks to. */
    private static final String ARABIC = "ar";

    /** Prefix of the human-readable article pages. */
    private static final String ARTICLE_URL_PREFIX = "https://ar.wikipedia.org/wiki/";

    /** Upper bound, in bytes, handed to jsoup for every response body. */
    private static final int MAX_BODY_BYTES = 47483647;

    /** Marker inserted between the individual values of a multi-valued infobox cell. */
    private static final String VALUE_MARK = "%@%";

    /** {@link #VALUE_MARK} surrounded by spaces, as inserted into the text. */
    private static final String PADDED_MARK = " " + VALUE_MARK + " ";

    /** First and last code point accepted by {@link #isProbablyArabic(String)}. */
    private static final int ARABIC_FIRST = 0x0600;
    private static final int ARABIC_LAST = 0x06E0;

    private String ar_article_name;

    /**
     * @param str title of the Arabic Wikipedia article; spaces are allowed and
     *            are converted to underscores when URLs are built
     */
    public ArticleProcessing(String str) {
        this.ar_article_name = str;
    }

    /** @return the article title this instance works on */
    public String getAr_article_name() {
        return this.ar_article_name;
    }

    /** @param str new article title to work on */
    public void setAr_article_name(String str) {
        this.ar_article_name = str;
    }

    /**
     * Looks up the titles of this article in other language editions.
     *
     * <p>Best effort: when the page or its {@code langlinks} cannot be read, a
     * message is printed to {@code System.err} and the links collected so far
     * (possibly none) are returned.
     *
     * @return one {@code WikiLang} per language link, never {@code null}
     * @throws IOException    if the HTTP request fails
     * @throws ParseException never thrown by this implementation; kept so that
     *                        existing callers continue to compile
     */
    public ArrayList<WikiLang> getArticleTitleInDiffrentLanguages() throws IOException, ParseException {
        ArrayList<WikiLang> titles = new ArrayList<>();
        String body = fetchBody(apiUrl(ARABIC) + "?action=query&titles=" + queryTitle(getAr_article_name())
                + "&prop=langlinks&lllimit=500&format=json");
        // Parse the raw body: un-escaping it first would turn \" into a bare
        // quote and corrupt the JSON document.
        JSONObject page = firstPage(body);
        try {
            if (page == null) {
                throw new JSONException("no page in response");
            }
            JSONArray links = page.getJSONArray("langlinks");
            for (int i = 0; i < links.length(); i++) {
                JSONObject link = links.getJSONObject(i);
                titles.add(new WikiLang(link.getString("lang"), link.getString("*")));
            }
        } catch (JSONException e) {
            System.err.println("ERROR reading article: " + getAr_article_name());
        }
        return titles;
    }

    /**
     * Reads the category names from the category box of the article page.
     *
     * @return the category names in page order; empty when the page has no
     *         {@code div.mw-normal-catlinks} element
     * @throws IOException if the HTTP request fails
     */
    public ArrayList<String> getTags() throws IOException {
        return parseCategories(StringEscapeUtils.unescapeJava(fetchBody(articleUrl())));
    }

    /**
     * Fetches the plain-text introduction of the article.
     *
     * @return the trimmed intro extract with every double quote
     *         backslash-escaped; empty when the API returns no extract
     * @throws IOException if the HTTP request fails
     */
    public String getSnippet() throws IOException {
        return fetchExtract(getAr_article_name(), ARABIC).trim().replace("\"", "\\\"");
    }

    /**
     * @return the visible text of the article page
     * @throws IOException if the HTTP request fails
     */
    public String getPageContent() throws IOException {
        return fetchArticle().text();
    }

    /**
     * @return the HTML of the article page as serialised by jsoup
     * @throws IOException if the HTTP request fails
     */
    public String getPageHTML() throws IOException {
        return fetchArticle().html();
    }

    /**
     * Tells whether the string contains at least one code point in the range
     * U+0600..U+06E0.
     */
    private static boolean isProbablyArabic(String str) {
        for (int i = 0; i < str.length(); ) {
            int codePoint = str.codePointAt(i);
            if (codePoint >= ARABIC_FIRST && codePoint <= ARABIC_LAST) {
                return true;
            }
            i += Character.charCount(codePoint);
        }
        return false;
    }

    /**
     * Returns the page text reduced to its Arabic words: punctuation, Latin
     * letters, ASCII digits, Arabic-Indic digits and the symbols {@code = | ^}
     * are stripped, and only whitespace-separated tokens that contain an
     * Arabic code point are kept.
     *
     * @return the kept words joined by single spaces; empty when none remain
     * @throws IOException if the HTTP request fails
     */
    public String getArabicPlainText() throws IOException {
        String normalized = fetchArticle().text()
                .split("\t")[0]
                .trim()
                .replaceAll("\\p{P}", " ")
                .replaceAll("[a-zA-Z0-9]", "")
                // Arabic-Indic digits U+0660..U+0669
                .replaceAll("[\u0660-\u0669]", "")
                .replaceAll("[=|^]", " ")
                .replaceAll("\\s+", " ")
                .trim();

        StringBuilder arabicOnly = new StringBuilder();
        for (String token : normalized.split(" ")) {
            String word = token.trim();
            if (isProbablyArabic(word)) {
                if (arabicOnly.length() > 0) {
                    arabicOnly.append(' ');
                }
                arabicOnly.append(word);
            }
        }
        return arabicOnly.toString();
    }

    /**
     * Builds a JSON report from the article's infobox plus its category tags,
     * intro snippet and lead image.
     *
     * <p>When the page has no recognised infobox the report contains only the
     * tags, snippet and image entries.
     *
     * @return the report serialised with {@code JSONObject.toString()} and then
     *         passed through {@code unescapeJava}
     * @throws IOException    if fetching the article page fails
     * @throws ParseException never thrown by this implementation; kept so that
     *                        existing callers continue to compile
     */
    public String getStructuredReport() throws IOException, ParseException {
        String html = StringEscapeUtils.unescapeJava(fetchBody(articleUrl()));
        Document page = Jsoup.parse(html);

        Element infobox = page.select("div.infobox").first();
        // "label: value" rows are only recognised when the page has no div.infobox.
        boolean allowColonRows = infobox == null;
        if (infobox == null) {
            infobox = firstMatch(page, "table.infobox.infobox_v2", "table.infobox", "div.infobox_v3");
        }

        JSONObject report = new JSONObject();
        if (infobox != null) {
            addInfoboxRows(report, infobox, allowColonRows);
        }
        report.put("كلمات مفتاحية", getTagsInner(html));
        report.put("ملخص", getSnippetInner(getAr_article_name(), ARABIC));
        report.put("صورة", getPicInner(getAr_article_name(), ARABIC));
        // NOTE(review): un-escaping the serialised JSON also turns \" back into a
        // bare quote, so the result is not guaranteed to be parseable JSON. Kept
        // because callers may rely on the current output format.
        return StringEscapeUtils.unescapeJava(report.toString());
    }

    /** Returns the first element matched by any of the selectors, tried in order, or {@code null}. */
    private static Element firstMatch(Document page, String... selectors) {
        for (String selector : selectors) {
            Element match = page.select(selector).first();
            if (match != null) {
                return match;
            }
        }
        return null;
    }

    /**
     * Walks the cleaned infobox markup line by line and stores every
     * header/value pair found on two consecutive lines in {@code report}.
     */
    private static void addInfoboxRows(JSONObject report, Element infobox, boolean allowColonRows) {
        String wrapped = ("<html><body><table>" + infobox.html() + "</table></body></html>")
                .replace("<br>", PADDED_MARK)
                .replace("<br />", PADDED_MARK)
                .replace("<br/>", PADDED_MARK);
        Whitelist allowed = Whitelist.none()
                .addTags("span", "td", "th", "li", "a")
                .addAttributes("th", "scope", "colspan");
        String[] lines = Jsoup.clean(wrapped, allowed).split("\n");

        for (int i = 0; i < lines.length - 1; i++) {
            String line = lines[i].trim();
            if (line.contains("<th colspan=\"2\"")) {
                // Section title spanning both columns: it never carries a value.
                continue;
            }
            String next = lines[i + 1].trim();
            // startsWith is safe for lines shorter than the "<td>" prefix.
            boolean nextIsCell = next.startsWith("<td>");
            boolean headerRow = nextIsCell && line.contains("<th scope=\"row\">");
            boolean colonRow = nextIsCell && allowColonRows && line.contains(":") && line.contains("<td");
            if (!headerRow && !colonRow) {
                continue;
            }

            String key = colonRow ? cleanHeaderV2(line) : cleanHeader(line);
            String value = cleanData(next);
            if (!value.contains(VALUE_MARK)) {
                report.put(key, value);
                continue;
            }
            JSONArray values = new JSONArray();
            for (String part : value.split(VALUE_MARK)) {
                String item = part.trim();
                if (item.length() > 0) {
                    values.put(item);
                }
            }
            report.put(key, values);
        }
    }

    /** Category names of the given page HTML as a JSON array; empty when there is no category box. */
    private JSONArray getTagsInner(String str) {
        JSONArray tags = new JSONArray();
        for (String tag : parseCategories(str)) {
            tags.put(tag);
        }
        return tags;
    }

    /**
     * Intro extract of {@code str} on the {@code str2} language wiki, with double
     * quotes replaced by {@code -} and whitespace runs collapsed. Returns an
     * empty string when the extract cannot be fetched or parsed.
     */
    private String getSnippetInner(String str, String str2) {
        try {
            return fetchExtract(str, str2).trim().replace("\"", "-").replaceAll("\\s+", " ");
        } catch (IOException | RuntimeException e) {
            // Best effort: the report is still useful without a snippet.
            return "";
        }
    }

    /**
     * Thumbnail URL of the lead image of {@code str} on the {@code str2} language
     * wiki, or an empty string when there is none or the lookup fails.
     */
    private String getPicInner(String str, String str2) {
        try {
            String body = fetchBody(apiUrl(str2) + "?action=query&titles=" + queryTitle(str)
                    + "&prop=pageimages&format=json&pithumbsize=2000");
            JSONObject page = firstPage(body);
            JSONObject thumbnail = page == null ? null : page.optJSONObject("thumbnail");
            return thumbnail == null ? "" : thumbnail.optString("source", "").trim();
        } catch (IOException | RuntimeException e) {
            // Best effort: a missing image must not fail the caller.
            return "";
        }
    }

    /**
     * @return the thumbnail URL of the article's lead image, or an empty string
     *         when there is none or the lookup fails
     */
    public String getPic() {
        return getPicInner(getAr_article_name(), ARABIC);
    }

    /** URL of the article page for the current title. */
    private String articleUrl() {
        return ARTICLE_URL_PREFIX + getAr_article_name().replace(" ", "_");
    }

    /** Endpoint of the MediaWiki API on the given language edition. */
    private static String apiUrl(String lang) {
        return "https://" + lang + ".wikipedia.org/w/api.php";
    }

    /**
     * Converts a title into a value that is safe inside an API query string:
     * spaces become underscores and the rest is percent-encoded as UTF-8.
     */
    private static String queryTitle(String title) throws IOException {
        return URLEncoder.encode(title.replace(" ", "_"), "UTF-8");
    }

    /** Downloads {@code url} and returns the raw response body. */
    private static String fetchBody(String url) throws IOException {
        return Jsoup.connect(url).ignoreContentType(true).maxBodySize(MAX_BODY_BYTES).execute().body();
    }

    /** Downloads and parses the article page for the current title. */
    private Document fetchArticle() throws IOException {
        return Jsoup.connect(articleUrl()).ignoreContentType(true).maxBodySize(MAX_BODY_BYTES).get();
    }

    /**
     * Returns the first entry of {@code query.pages} in an API response, or
     * {@code null} when the response has no such entry.
     */
    private static JSONObject firstPage(String apiBody) {
        JSONObject query = new JSONObject(apiBody).optJSONObject("query");
        JSONObject pages = query == null ? null : query.optJSONObject("pages");
        JSONArray pageIds = pages == null ? null : pages.names();
        if (pageIds == null || pageIds.length() == 0) {
            return null;
        }
        return pages.optJSONObject(pageIds.getString(0));
    }

    /** Raw intro extract of {@code title} on the {@code lang} wiki; empty when the API returns none. */
    private static String fetchExtract(String title, String lang) throws IOException {
        String body = fetchBody(apiUrl(lang)
                + "?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&titles="
                + queryTitle(title));
        JSONObject page = firstPage(body);
        return page == null ? "" : page.optString("extract", "");
    }

    /**
     * Extracts the category names from page HTML. Returns an empty list when
     * the page has no {@code div.mw-normal-catlinks} element.
     */
    private static ArrayList<String> parseCategories(String html) {
        ArrayList<String> categories = new ArrayList<>();
        Element box = Jsoup.parse(html).select("div.mw-normal-catlinks").first();
        if (box == null) {
            return categories;
        }
        String wrapped = "<html><body><table>" + box.html() + "</table></body></html>";
        String[] items = Jsoup.clean(wrapped, Whitelist.none().addTags("li")).split("<li>");
        int last = items.length - 1;
        items[last] = items[last].replace(" :", "");
        // items[0] is whatever precedes the first <li>, so it is skipped.
        for (int i = 1; i < items.length; i++) {
            categories.add(items[i].trim().replace("</li>", "").trim());
        }
        return categories;
    }

    /** Like {@link #cleanHeader(String)} but also drops {@code <td>} tags and bullet characters. */
    private static String cleanHeaderV2(String str) {
        return str
                .replace("<th colspan=\"2\">", "")
                .replace("<th scope=\"row\">", "")
                .replace("</th>", "")
                .replace("<td>", "")
                .replace("</td>", "")
                .replace("<a>", "")
                .replace("</a>", "")
                .replace("&nbsp;", " ")
                .replace("<span>", "")
                .replace("</span>", "")
                .replace("•", "")
                .replaceAll("\\s+", " ")
                .trim();
    }

    /** Strips the header-cell, link and span tags from an infobox header line and normalises whitespace. */
    private static String cleanHeader(String str) {
        return str
                .replace("<th colspan=\"2\">", "")
                .replace("<th scope=\"row\">", "")
                .replace("</th>", "")
                .replace("<a>", "")
                .replace("</a>", "")
                .replace("&nbsp;", " ")
                .replace("<span>", "")
                .replace("</span>", "")
                .replaceAll("\\s+", " ")
                .trim();
    }

    /**
     * Strips markup from an infobox value line and replaces list separators
     * (commas, list items, em dashes) with the value marker. The replacement
     * order is significant and must not be changed.
     */
    private static String cleanData(String str) {
        return str
                .replace("، &nbsp;", PADDED_MARK)
                .replace("&nbsp;", " ")
                .replace("، ", PADDED_MARK)
                .replace(", ", PADDED_MARK)
                .replace("<td>", "")
                .replace("</td>", "")
                .replace("<span>", "")
                .replace("</span>", "")
                .replace("</a>,", PADDED_MARK)
                .replace("</a>،", PADDED_MARK)
                .replace("<a>", "")
                .replace("</a>", "")
                .replace("<li>", "")
                .replace("</li>", PADDED_MARK)
                // drop bracketed fragments such as footnote markers
                .replaceAll("\\[.*?\\]", "")
                .replace("—", PADDED_MARK)
                .replaceAll("\\s+", " ")
                .trim();
    }
}
