import { truncate } from "@/lib/stringUtils";
import type { TraceContext } from "@/lib/trace";
import { Attribute, LinkedinCompanyProfile, LinkedinProfile } from "@/types/attributes";
import { CookieList, CookieMonster } from "@/types/cookies";
import { EntityType } from "@/types/enums";
import type { User } from "@prisma/client";
import type { JsonObject } from "@prisma/client/runtime/library";

/**
 * Shape of a crawler object: configuration/state fields plus a multi-URL entry
 * point (`run`) and a single-URL entry point (`crawlOne`).
 */
export interface Crawler {
  /** Configuration for this crawler. */
  options: CrawlerOptions;
  /** List of page URLs; presumably appended to as pages are crawled — confirm in implementations. */
  pagesVisited: string[];
  /** Optional retry count. NOTE(review): how this relates to `options.retries` is not visible here. */
  retries?: number;
  /** Optional email address of a user associated with this crawler. */
  userEmail?: string;
  /** Cookies held by this crawler. */
  cookies: CookieList;

  /**
   * Processes a list of URLs, reporting through callbacks instead of a return value.
   *
   * @param urls - URLs to process.
   * @param onResult - Called with a `CrawlResult`; may be synchronous or return a promise.
   * @param onError - Optional. Called with a URL and an error value of unknown type;
   *   may be synchronous or return a promise.
   * @returns A promise with no value; presumably settles once all URLs are handled — confirm.
   */
  run(
    urls: string[],
    onResult: (result: CrawlResult) => void | Promise<void>,
    onError?: (url: string, error: unknown) => void | Promise<void>,
  ): Promise<void>;
  /**
   * Crawls a single URL.
   *
   * @param url - URL to crawl.
   * @param screenshot - Optional flag; presumably requests a screenshot of the page — confirm.
   * @returns A promise for the resulting `CrawlResult`.
   */
  crawlOne(url: string, screenshot?: boolean): Promise<CrawlResult>;
}

/**
 * Data captured for one crawled page. `url`, `title`, `body`, `links` and
 * `socialTags` are always present; every other field is optional.
 */
export interface CrawlResult {
  /** URL of the crawled page. */
  url: string;
  /** NOTE(review): presumably the originally requested URL when it differs from `url` — confirm. */
  requestUrl?: string;
  title: string;
  description?: string;
  /** Page content as a string. NOTE(review): exact format (HTML vs. extracted text) is not visible here. */
  body: string;
  /** Links associated with the page. */
  links: Link[];
  screenshotUrl?: string;
  favicon?: string;
  lastModified?: Date; // from last-modified headers
  publishDate?: string; // from meta og:publish_date
  /** String-to-string map of social tags; presumably Open Graph / Twitter meta tags — confirm. */
  socialTags: Record<string, string>;
  /** JSON objects of structured data; presumably parsed from embedded JSON-LD — confirm. */
  structuredData?: JsonObject[];
  /** RSS feeds, expressed as links. */
  rssFeeds?: Link[];
  /** NOTE(review): presumably the unprocessed response, populated when `includeRawResult` is set — confirm. */
  raw?: string;
  /** LinkedIn person profile data, when available. */
  linkedin?: LinkedinProfile;
  /** LinkedIn company profile data, when available. */
  linkedinCompany?: LinkedinCompanyProfile;
}

/**
 * String identifiers for the known crawler kinds. Each member's value is a
 * kebab-case string, so the runtime value of a member is that string.
 */
export enum CrawlerType {
  Cheerio = "cheerio",
  CrunchbasePerson = "crunchbase-person",
  CrunchbasePersonInvestments = "crunchbase-person-investments",
  CrunchbasePersonPartnerInvestments = "crunchbase-person-partner-investments",
  CrunchbaseBatchData = "crunchbase-batch-data",
  CrunchbaseCompany = "crunchbase-company",
  CrunchbaseVerifier = "crunchbase-verifier",
  CrunchbaseCompanyEmployees = "crunchbase-company-employees",
  CrunchbaseCompanyAdvisors = "crunchbase-company-advisors",
  CrunchbaseCompanyFunding = "crunchbase-company-funding",
  CrunchbaseCompanyFundingRound = "crunchbase-company-funding-round",
  CrunchbaseCompanyInvestments = "crunchbase-company-investments",
  Github = "github",
  // Note: the value order ("playwright-lambda") is reversed relative to the member name.
  LambdaPlaywright = "playwright-lambda",
  LinkedinCompany = "linkedin-company",
  LinkedinCompanyMutuals = "linkedin-company-mutuals",
  LinkedinMutuals = "linkedin-mutuals",
  LinkedinPerson = "linkedin-person",
  LinkedinPost = "linkedin-post",
  NFXFirm = "nfx-firm",
  NFXInvestor = "nfx-investor",
  PitchBook = "pitchbook",
  Serp = "serp",
  Unblocker = "unblocker",
  VCSheetCompany = "vcsheet-company",
  Web = "web",
}

/**
 * Optional settings for a crawler. Every field is optional.
 * NOTE(review): defaults and precise semantics are decided by the crawler
 * implementations, which are not visible from this file.
 */
export interface CrawlerOptions {
  /** Cookies to supply to the crawler. */
  cookies?: CookieList;
  /** Cookie manager objects (`CookieMonster`); presumably sources of cookies — confirm. */
  cookieManagers?: CookieMonster[];
  /** Header name-to-value map; presumably sent with requests — confirm. */
  headers?: Record<string, string>;
  /** Presumably controls whether `CrawlResult.raw` is populated — confirm. */
  includeRawResult?: boolean;
  preserveContext?: boolean;
  retries?: number;
  screenshot?: boolean;
  skipCache?: boolean;
  skipFallbackCookies?: boolean;
  /** Timeout value; the name indicates milliseconds. */
  timeoutMillis?: number;
  traceContext?: TraceContext;
  /** Either a full Prisma `User` or a minimal object carrying only an `email`. */
  user?: { email: string } | User;
  userAgent?: string;
  verbose?: boolean;
}

/**
 * Runtime type guard for `CrawlResult`.
 *
 * Only a subset of the interface is verified: `url` and `body` must be strings,
 * and `links` must be an array whose entries each expose a string `url`. Other
 * required fields (for example `title` and `socialTags`) are NOT checked, so a
 * passing value is only guaranteed to have those three properties.
 *
 * @param result - Any value, including `null`, `undefined` and primitives.
 * @returns `true` when the value passes the checks above, otherwise `false`.
 */
export function isCrawlResult(result: unknown): result is CrawlResult {
  // Nullish input has no properties to inspect.
  if (result === null || result === undefined) return false;

  const candidate = result as Partial<CrawlResult>;
  if (typeof candidate.url !== "string") return false;
  if (typeof candidate.body !== "string") return false;

  const links: unknown = candidate.links;
  if (!Array.isArray(links)) return false;

  // Valid exactly when no entry lacks a string `url`.
  const hasInvalidLink = links.some(
    (entry: { url?: unknown } | null | undefined) => typeof entry?.url !== "string",
  );
  return !hasInvalidLink;
}

/** A `CrawlResult` with the `body` property removed; all other fields are unchanged. */
export type CrawlResultSansBody = Omit<CrawlResult, "body">;

/**
 * Describes a crawl of a single URL together with optional per-request settings.
 * Several optional fields share names with `CrawlerOptions` fields.
 * NOTE(review): presumably the payload handed to a crawl handler — confirm against callers.
 */
export type CrawlEvent = {
  /** The URL to crawl (the only required field). */
  url: string;
  cookies?: CookieList;
  screenshot?: boolean;
  userEmail?: string;
  includeRawResult?: boolean;
  userAgent?: string;
  traceContext?: TraceContext;
};

/** A link: a required `url` and an optional `title`. */
export type Link = { url: string; title?: string };

/** A `Link` extended with an optional `description`. */
export type LinkWithDescription = Link & { description?: string };

/**
 * Details about an image and a link associated with it.
 * The four image fields are nullable strings; the two link fields are always strings.
 * NOTE(review): presumably raw `<img>` attribute values (null when the attribute is
 * absent) plus the anchor wrapping the image — confirm against the producer.
 */
export type ImageDetails = {
  src: string | null;
  alt: string | null;
  // width/height are strings, not numbers.
  width: string | null;
  height: string | null;
  linkUrl: string;
  linkTitle: string;
};

/**
 * An entity with a name, an `EntityType`, and a URL, plus optional descriptive fields.
 * NOTE(review): presumably produced by extracting entities from crawled content — confirm.
 */
export type ExtractedEntity = {
  name: string;
  type: EntityType;
  title?: string;
  description?: string;
  url: string;
  profileImage?: string;
  /** Optional attribute, restricted to the `attributeType` and `value` fields of `Attribute`. */
  attribute?: Pick<Attribute, "attributeType" | "value">;
};

/**
 * Shape of a Crunchbase data record describing an organization and its related
 * funders, founders and employees.
 * NOTE(review): the `"@type"` keys and field names resemble schema.org JSON-LD
 * structured data — confirm against the source of this data. All fields except
 * `funder[].logo` are declared as required.
 */
export type CrunchbaseData = {
  url: string;
  logo: string;
  name: string;
  "@type": string;
  image: string;
  /** Funders; `logo` is the only optional field on each entry. */
  funder: {
    url: string;
    logo?: string;
    name: string;
    "@type": string;
    image: string;
  }[];
  /** List of strings; presumably URLs of equivalent profiles elsewhere — confirm. */
  sameAs: string[];
  address: {
    "@type": string;
    addressRegion: string;
    addressCountry: string;
    addressLocality: string;
  };
  founder: {
    url: string;
    name: string;
    "@type": string;
    image: string;
  }[];
  /** Employees; same shape as `founder` entries plus a `jobTitle`. */
  employee: {
    url: string;
    name: string;
    "@type": string;
    image: string;
    jobTitle: string;
  }[];
  legalName: string;
  description: string;
  /** Founding date as a string; NOTE(review): format is not visible here. */
  foundingDate: string;
};

/** Length limit passed to `truncate` for titles; titles are expected to be short, so this is a safe upper bound. */
const MAX_TITLE_LENGTH = 200;

/**
 * Returns the title of a crawl result after passing it through `truncate`
 * with `MAX_TITLE_LENGTH`.
 *
 * @param crawlResult - Any object with an optional `title`, or `undefined`.
 * @returns The result of `truncate`, or `undefined` when the argument is missing
 *   or its title is absent or empty.
 */
export function getSanitizedTitle(crawlResult: { title?: string } | undefined): string | undefined {
  const title = crawlResult?.title;
  return title ? truncate(title, MAX_TITLE_LENGTH) : undefined;
}

/**
 * Length limit passed to `truncate` for descriptions. Descriptions should be short,
 * and most social media platforms only show the first 200 characters.
 */
const MAX_DESCRIPTION_LENGTH = 200;

/**
 * Returns the description of a crawl result after passing it through `truncate`
 * with `MAX_DESCRIPTION_LENGTH`.
 *
 * @param crawlResult - Any object with an optional `description`, or `undefined`.
 * @returns The result of `truncate`, or `undefined` when the argument is missing
 *   or its description is absent or empty.
 */
export function getSanitizedDescription(
  crawlResult: { description?: string } | undefined,
): string | undefined {
  const description = crawlResult?.description;
  // Covers a missing argument, a missing description, and the empty string.
  if (!description) return undefined;
  return truncate(description, MAX_DESCRIPTION_LENGTH);
}
