import * as cheerio from "cheerio";
import he from "he";

import { sanitizeLink } from "@/crawler/sanitizeLinks";
import { ElementType } from "domelementtype";
// Tags that htmlElemToText re-emits as markup (when they have text content)
// instead of flattening them to plain text.
const keepTags = [
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "table",
  "code",
  "pre",
  "blockquote",
  "time",
];

/**
 * Parses an HTML string with cheerio.
 *
 * @param bodyHTML Raw HTML markup.
 * @returns The cheerio API bound to the parsed document.
 */
export function htmlToDom(bodyHTML: string): cheerio.CheerioAPI {
  return cheerio.load(bodyHTML);
}

/**
 * Converts an HTML document into plain text, dropping page chrome first.
 *
 * When a cheerio document is passed in (rather than a string), the removals
 * below are applied to that document in place.
 *
 * @param htmlOrDoc Raw HTML, or an already-parsed cheerio document.
 * @param keepUrls Forwarded to `htmlElemToText`; keeps links/images as markup.
 * @param skipShortBodyCheck Run the second, more aggressive removal pass even
 *   when the body text is 1000 characters or shorter.
 * @returns The text rendered from the remaining `<body>` content.
 */
export function htmlToText(
  htmlOrDoc: cheerio.CheerioAPI | string,
  keepUrls?: boolean,
  skipShortBodyCheck?: boolean,
): string {
  const $ = typeof htmlOrDoc === "string" ? htmlToDom(htmlOrDoc) : htmlOrDoc;

  // Sections that are never wanted, regardless of page size.
  const alwaysRemoved = [
    "head",
    "iframe",
    "style",
    "noscript",
    ".modal",
    ".seo",
    "[ad-id]",
    "#msg-overlay",
    ".post-block--unread", // techcrunch collapsed articles
  ];
  for (const selector of alwaysRemoved) {
    $(selector).remove();
  }

  // Drop every <script> except those typed as application/json.
  $("script")
    .filter((_, el) => $(el).attr("type") !== "application/json")
    .remove();

  // When an expanded "more" text exists, discard the truncated "less" sibling
  // that directly precedes it.
  $(".show-more-less-text__text--more").each((_, el) => {
    const truncated = $(el).prev();
    if (truncated.hasClass("show-more-less-text__text--less")) {
      truncated.remove();
    }
  });

  // Label the join date so it stays meaningful once flattened to text.
  $("[data-testid='UserJoinDate']").each((_, el) => {
    const joinDate = $(el);
    joinDate.text(`\nJoined Twitter: ${joinDate.text()}`);
  });

  // Small pages are left alone beyond this point. The length is measured once,
  // before the second pass, and falls back to 1 to avoid dividing by zero.
  const bodyLength = $("body").text().length || 1;
  const runSecondPass = skipShortBodyCheck || bodyLength > 1000;

  if (runSecondPass) {
    const chromeSelectors = [
      "page-header",
      "page-footer",
      "nav",
      "form[action]",
      "code",
      "[data-test-id=browse-jobs]",
      "[data-test-id=similar-pages]",
      "[data-test-id=sidebarColumn]",
      "button",
      "input",
      "textarea",
      "select",
      "ins",
      ".footer, #footer",
      ".footer-contacts",
      ".header, #header",
      ".cn-fundraising",
      "header[role=banner]",
      "[aria-label=Trending]",
      "[role=navigation]",
      "[role=complementary]",
      "[class*=top-stories]",
      "[class*=recirc]",
      "[aria-hidden=true]", // this could be too aggressive but in general probably helps
    ];

    for (const selector of chromeSelectors) {
      const matches = $(selector);
      // Heuristic: only remove matches whose text is under 30% of the original
      // body text, so a selector that wraps the main content is left in place.
      const shareOfBody = matches.text().length / bodyLength;
      if (shareOfBody < 0.3) {
        matches.remove();
      }
    }
  }

  return htmlElemToText($, $("body"), keepUrls);
}

// Tags treated as block-level by htmlElemToText: their text is set off with
// newlines unless the element's display is explicitly "inline".
const blockElems = [
  "div",
  "p",
  "br",
  "tr",
  "table",
  "form",
  "ul",
  "ol",
  "li",
];

/**
 * Flattens the DOM subtree under `elem` into compact plain text.
 *
 * Text nodes are entity-decoded and emitted verbatim unless the whole node
 * (trimmed, lowercased) is a phrase listed in `uselessContent`. Elements with
 * a `hidden`, `hide` or `navigation` class, or whose `display`, `visibility`,
 * `opacity` or `position` (as reported by cheerio's `.css()`) mark them as
 * hidden or fixed, are skipped together with their children. Tags in
 * `keepTags` (plus `a`, `img` and `iframe` when `keepUrls` is set) are
 * re-emitted as minimal markup; every other element contributes only its
 * text, with newlines around block elements, `- ` before list items, a tab
 * after table cells and `---` for `<hr>`.
 *
 * @param $ Cheerio instance used to wrap raw nodes.
 * @param elem Root whose children are rendered.
 * @param keepUrls Keep links/images as markup, with `href`/`src` passed
 *   through `sanitizeLink`.
 * @param skipDecorateLists Do not prefix `<li>` content with `- `.
 * @returns The rendered text after Unicode sanitizing and whitespace cleanup.
 */
// eslint-disable-next-line @typescript-eslint/max-params
export function htmlElemToText(
  $: cheerio.CheerioAPI,
  elem: cheerio.Cheerio<cheerio.AnyNode>,
  keepUrls?: boolean,
  skipDecorateLists?: boolean,
): string {
  const linkTags = keepUrls ? ["a", "img", "iframe"] : [];
  // Attributes copied onto re-emitted tags; nothing is copied unless URLs are kept.
  const goodAttributes =
    keepUrls ? ["href", "src", "title", "alt", "width", "height", "aria-label"] : [];

  // Helper function to process a node and its children
  const processNode = (node: cheerio.Cheerio<cheerio.AnyNode>): string => {
    const lines: string[] = [];
    // True once a flattened (non-kept) element has been emitted at this level;
    // used to separate consecutive block elements with a newline.
    let hasPrev = false;
    node.contents().each((_, el) => {
      const element = $(el);

      // Process text node
      if (el.type === ElementType.Text) {
        const text = he.decode(element.text());
        if (!uselessContent.has(text.trim().toLowerCase())) {
          lines.push(text);
        }
        return;
      }

      const classNames = element.attr("class")?.split(" ") || [];
      if (
        classNames.includes("hidden") ||
        classNames.includes("hide") ||
        classNames.includes("navigation")
      )
        return;

      // ignore invisible elements
      const displaySetting = element.css("display");
      if (
        displaySetting === "none" ||
        element.css("visibility") === "hidden" ||
        element.css("opacity") === "0"
      )
        return;

      // heuristic: positioned elements are probably not part of the main content
      // -- except for some reason twitter puts their tweets in an absolute div,
      // so only "fixed" is skipped
      const position = element.css("position");
      if (position === "fixed") return;

      // Process element node; nodes without a tag name (e.g. comments) are dropped
      const tagName = (el as unknown as Element).tagName?.toLowerCase();
      if (!tagName) return;

      // Preserve the links and images if keepUrls is true
      if (keepTags.includes(tagName) || linkTags.includes(tagName)) {
        const textContent = processNode(element).trim();
        // Only <img> may be emitted without any text content.
        if (!textContent && tagName !== "img") return;
        let newElement = `<${tagName}`;

        // only keep elements with valid links
        let hasLink = false;
        for (const attr of goodAttributes) {
          const rawValue = element.attr(attr);
          if (!rawValue) continue;

          let value = rawValue;
          if (attr === "href" || attr === "src") {
            const safeLink = sanitizeLink(rawValue) ?? undefined;
            // A rejected link is omitted entirely rather than being written
            // out as the literal string "undefined".
            if (!safeLink) continue;
            value = safeLink;
            hasLink = true;
          }
          newElement += ` ${attr}="${value}"`;
        }
        if ((tagName === "a" || tagName === "img") && !hasLink) return;
        newElement += textContent ? `>${textContent}</${tagName}>` : ` />`;
        lines.push(newElement);
        return;
      }

      // Recurse into the children
      const contents = processNode(element);

      const isInlineElem = displaySetting === "inline" || !blockElems.includes(tagName);
      // Separate this block from the previous element *before* any list bullet,
      // so the bullet stays on the same line as the item it belongs to.
      if (hasPrev && !isInlineElem) lines.push("\n");

      const decorateAsListItem =
        tagName === "li" && !skipDecorateLists && contents.trim() !== "";
      if (decorateAsListItem) {
        // Leading whitespace inside the item would otherwise push its text
        // onto the line after the bullet.
        lines.push("- " + contents.trimStart());
      } else if (contents) {
        lines.push(contents);
      }

      // Add extra text depending on the tag
      if (!isInlineElem) {
        lines.push("\n");
      } else if (tagName === "hr") {
        lines.push("\n---\n");
      } else if (tagName === "td") {
        lines.push("\t");
      }

      hasPrev = true;
    });

    return lines.join("");
  };

  let result = processNode(elem);

  // sanitize unicode
  result = sanitizeUnicode(result);

  // remove blank lines
  result = result.replace(/\n\s*\n/g, "\n");

  // remove extra spaces
  result = result.replace(/ +/g, " ");

  // remove leading and trailing spaces for each line
  result = result.replace(/\n\s+|\s+\n/g, "\n");

  return result.trim();
}

// Boilerplate phrases that commonly occur on websites and carry no content.
// A text node is dropped when its trimmed, lowercased text equals one of
// these exactly, so every entry must be lowercase.
const uselessContent = new Set<string>([
  // players, scripts and page furniture
  "video player",
  "javascript is required",
  "skip to main content",
  "continue reading",
  "join now",
  "welcome back",
  // legal and policy links
  "user agreement",
  "privacy policy",
  "terms of service",
  "cookie policy",
  "gdpr notice",
  // social and app prompts
  "follow",
  "like",
  "get the app",
  // sign-in / sign-up prompts
  "sign up for free",
  "continue with google",
  "continue with facebook",
  "sign in",
  "sign up",
  "sign in with google",
  "sign in with facebook",
  "sign up with google",
  "sign up with facebook",
  // miscellaneous
  "all rights reserved",
  "status is offline",
  "click to upgrade to premium",
]);

/**
 * Removes tag-like markup from a string with regular expressions (no HTML
 * parsing) and collapses all whitespace runs to single spaces.
 *
 * A `<` counts as the start of a tag only when the next character is not a
 * space, so text such as `a < b` is left intact. An unterminated tag is
 * stripped through to the end of the input.
 *
 * @param html Markup or text to strip.
 * @param replaceUrls When true, every `http://` or `https://` URL remaining
 *   in the stripped text is replaced by the literal `<url>`.
 * @returns The stripped, whitespace-normalized, trimmed text.
 */
export function stripAllTags(html: string, replaceUrls?: boolean): string {
  const withoutTags = html.replace(/<[^ ][^>]*>?/gm, "");
  const normalized = withoutTags.replace(/\s+/g, " ").trim();

  if (!replaceUrls) {
    return normalized;
  }
  return normalized.replace(/https?:\/\/\S+/g, "<url>");
}

/**
 * Removes code units that are invalid or meaningless in extracted text:
 * unpaired UTF-16 surrogates, C1 control characters (U+0080–U+009F) and NUL.
 *
 * Printable Latin-1 Supplement characters (U+00A0–U+00FF, e.g. "é", "ü", "©")
 * are preserved; stripping that whole range would corrupt ordinary accented
 * text. Well-formed surrogate pairs (e.g. emoji) are also preserved, because
 * the `u` flag makes the first pattern match lone surrogates only.
 *
 * @param str Text to sanitize.
 * @returns `str` with the offending characters removed.
 */
export const sanitizeUnicode = (str: string): string => {
  return str
    .replace(/[\u{D800}-\u{DFFF}]/gu, "")
    .replace(/[\x80-\x9F]/g, "")
    .replace(/\x00/g, "");
};
