/* jedai-validator.jsx — deterministic decodability validator.
   Pure JS, no React. Same code is mirrored in functions/lib/validator.js
   so the server can run the same checks. If you change the rules here,
   change them there too.

   Exposed as window.JEDAI_VALIDATOR.

   THIS IS THE TRUTH SOURCE. Rule checks must NEVER use LLMs — LLMs
   rubber-stamp and drift. Code doesn't.
*/

(function () {
  /* Grapheme universe.
     These tables list every multi-letter unit the parser can recognise,
     independent of week. The scope built from the scope-and-sequence decides
     which of them count as "taught". The parser tries three-letter units
     before two-letter units before single letters.
     Magic-e (a_e etc.) and word-final suffixes are not listed here; they are
     recognised by pattern inside parseWord. */
  const TRIGRAPHS = "igh tch dge ear air are ore our".split(" ");
  const DIGRAPHS = [
    // vowel teams
    ..."ai ay au aw ea ee ei ey ie oa oe oi oo ou ow oy ue ui ew".split(" "),
    // r-controlled vowels
    ..."ar er ir or ur".split(" "),
    // consonant digraphs
    ..."ch ck gh kn ng ph qu sh th wh wr".split(" "),
    // doubled finals
    ..."ff ll ss zz".split(" "),
    /* Soft-c / soft-g positional digraphs (original note: introduced around
       week 24). They match wherever the two letters are adjacent; a word
       such as "cat" still yields a single 'c' because "ca" is not a unit. */
    ..."ce ci cy ge gi".split(" ")
  ];
  const SINGLE_LETTERS = Array.from("abcdefghijklmnopqrstuvwxyz");

  // 'y' is counted as a vowel for the purposes of pattern detection.
  const VOWELS = new Set("aeiouy");
  const isVowel = (ch) => VOWELS.has(ch);
  const isConsonant = (ch) => !isVowel(ch) && /[a-z]/.test(ch);

  /* Common contractions, in the apostrophe form that appears in passages.
     validatePassage classifies any token found here as a contraction and
     counts it with the sight words, without parsing its graphemes. */
  const COMMON_CONTRACTIONS = new Set(
    [
      "don't won't can't isn't wasn't weren't aren't didn't",
      "haven't hasn't hadn't wouldn't couldn't shouldn't",
      "i'm i'll i've i'd you're you'll you've you'd",
      "he's she's he'll she'll it's we're we'll we've we'd",
      "they're they'll they've they'd that's there's here's",
      "what's who's where's let's"
    ].join(" ").split(" ")
  );

  /* Irregular high-frequency words that standard grapheme rules do not
     decode. Stored as a Set for O(1) lookup; validatePassage always treats
     these as sight words. (Original note: borrowed from the Fry top 100.) */
  const HARD_IRREGULAR_SIGHT = new Set(
    [
      "the a of to you is are was were said have has",
      "do does could would should they there their one two",
      "from what who why where because been goes gone",
      "any many every very want into through again other love",
      "people water mother father sister brother friend laugh"
    ].join(" ").split(" ")
  );

  /* =========================================================
     Build a "taught" map from the passed scope-and-sequence cutoff
     ========================================================= */

  /**
   * Collect everything the sequence introduces up to `throughWeek`.
   *
   * Each row is read as `{ week, new: [{ g }, ...] }`; every `g` is
   * lower-cased and routed by its spelling:
   *   - "y_long_i" / "y_long_e"  -> the letter "y" in taughtSimple
   *   - contains "_e" ("a_e")    -> the vowel ("a") in taughtMagicE
   *   - leading "-"  ("-ing")    -> "ing" in taughtSuffix
   *   - trailing "-" ("bl-")     -> "bl" in taughtBlends (display only)
   *   - anything else            -> taughtSimple as-is
   *
   * @param {Iterable<object>|null|undefined} sequence scope-and-sequence rows
   * @param {number} throughWeek last week (inclusive) whose units count as taught
   * @returns {{taughtSimple:Set<string>, taughtMagicE:Set<string>,
   *            taughtSuffix:Set<string>, taughtBlends:Set<string>}}
   */
  function buildScope(sequence, throughWeek) {
    const taughtSimple = new Set();   // single, di-, tri-graphs
    const taughtMagicE = new Set();   // vowels whose <v>_e pattern is taught
    const taughtSuffix = new Set();   // suffixes without the leading "-"
    const taughtBlends = new Set();   // not used for validation, but tracked for display

    const rows = sequence ?? [];
    // Rows carrying a truthy .grade on the first entry mark a mixed-grade
    // sequence. Per the original note these come from getCumulativeScope with
    // the week cutoff already applied, so the week filter is skipped for them.
    const isMixedGrade = Array.isArray(rows) && rows.length > 0 && Boolean(rows[0]?.grade);

    for (const row of rows) {
      // Stops at the first later week, i.e. relies on rows being in week order.
      if (!isMixedGrade && row.week > throughWeek) break;
      for (const u of row.new ?? []) {
        const g = String(u.g).toLowerCase();
        if (g === "y_long_i" || g === "y_long_e") {
          // Must be tested before the "_e" check: "y_long_e" itself contains
          // "_e" and would otherwise be filed as a magic-e vowel "y_long".
          taughtSimple.add("y");
        } else if (g.includes("_e")) {
          taughtMagicE.add(g.replace("_e", ""));   // store the vowel: "a", "i", etc.
        } else if (g.startsWith("-")) {
          taughtSuffix.add(g.slice(1));
        } else if (g.endsWith("-")) {
          taughtBlends.add(g.slice(0, -1));
        } else {
          taughtSimple.add(g);
        }
      }
    }
    return { taughtSimple, taughtMagicE, taughtSuffix, taughtBlends };
  }

  /* =========================================================
     Parse a single word into grapheme units against a scope.
     Returns { parts: [{ g, taught, kind }], allTaught, untaught: [g,...] }
     kind: 'grapheme' | 'magic-e' | 'silent' | 'suffix'
     ========================================================= */

  /**
   * @param {string} rawWord word as it appears in the passage; everything
   *        except a-z and the straight apostrophe is dropped after lower-casing
   * @param {{taughtSimple:Set<string>, taughtMagicE:Set<string>,
   *          taughtSuffix:Set<string>}} scope result of buildScope
   * @returns {{parts:Array<{g:string,taught:boolean,kind:string}>,
   *            allTaught:boolean, untaught:string[]}}
   *          `untaught` lists the `g` of every part that is not taught
   *          ('silent' markers are never reported).
   */
  function parseWord(rawWord, scope) {
    const word = String(rawWord ?? "").toLowerCase().replace(/[^a-z']/g, "");
    if (!word) return { parts: [], allTaught: true, untaught: [] };

    /* Step 1 — strip a recognized suffix at the end (only one).
       The first candidate (longest first) that passes every guard wins:
         - at least two characters must remain as the stem;
         - a final "ss" is the doubled-consonant digraph (miss, dress),
           not a plural, so "-s" is not stripped from it;
         - the stem must contain a vowel, otherwise the "suffix" is really
           part of the base word (th+ing, ch+est, sl+ed).
       This is purely orthographic; words such as "bus" or "river" are still
       split as stem + suffix because telling them apart needs a lexicon. */
    let suffix = null;
    const suffixCandidates = ["ing", "est", "ed", "es", "er", "ly", "s"];
    for (const sx of suffixCandidates) {
      if (word.length <= sx.length + 1 || !word.endsWith(sx)) continue;
      if (sx === "s" && word.endsWith("ss")) continue;
      const candidateStem = word.slice(0, word.length - sx.length);
      if (!Array.from(candidateStem).some(isVowel)) continue;
      suffix = sx;
      break;
    }

    const stem = suffix ? word.slice(0, word.length - suffix.length) : word;
    const parts = [];

    /* Step 2 — detect magic-e at the end of the stem.
       Pattern: vowel + exactly one consonant letter + "e" at the very end,
       and only when that vowel's <v>_e pattern is in scope.
       e.g. "cake"  → c + a_e + k + e(silent)
            "shake" → sh + a_e + k + e(silent)
            "drive" → d + r + i_e + v + e(silent)
       The vowel is emitted as `<v>_e`; the consonant is still parsed on its
       own. When the pattern is not in scope the stem is parsed as ordinary
       graphemes and the final "e" is validated as a plain letter. */
    let trailingSilentE = false;
    if (
      stem.length >= 3 &&
      stem.endsWith("e") &&
      isConsonant(stem[stem.length - 2]) &&
      isVowel(stem[stem.length - 3])
    ) {
      trailingSilentE = scope.taughtMagicE.has(stem[stem.length - 3]);
    }

    /* Step 3 — greedy left-to-right parse of stem. */
    let i = 0;
    while (i < stem.length) {
      // The final "e" of a magic-e stem is a marker, not a grapheme to validate.
      // (It is only reached if no earlier unit, e.g. "ce"/"ge", consumed it.)
      if (trailingSilentE && i === stem.length - 1) {
        parts.push({ g: "e(silent)", taught: true, kind: "silent" });
        i += 1;
        continue;
      }

      // Magic-e vowel: takes precedence over any digraph/trigraph that could
      // start at this position (e.g. "are" in "care").
      if (
        trailingSilentE &&
        i === stem.length - 3 &&
        isVowel(stem[i])
      ) {
        const vowel = stem[i];
        // trailingSilentE is only set when the vowel is in scope, so this
        // lookup is true whenever the branch is reached.
        parts.push({ g: vowel + "_e", taught: scope.taughtMagicE.has(vowel), kind: "magic-e" });
        i += 1;
        continue;
      }

      // Try 3-letter grapheme
      if (i + 3 <= stem.length) {
        const tri = stem.slice(i, i + 3);
        if (TRIGRAPHS.includes(tri)) {
          parts.push({ g: tri, taught: scope.taughtSimple.has(tri), kind: "grapheme" });
          i += 3;
          continue;
        }
      }

      // Try 2-letter grapheme
      if (i + 2 <= stem.length) {
        const di = stem.slice(i, i + 2);
        if (DIGRAPHS.includes(di)) {
          parts.push({ g: di, taught: scope.taughtSimple.has(di), kind: "grapheme" });
          i += 2;
          continue;
        }
      }

      // Fall back to single letter
      const single = stem[i];
      if (single === "'") {
        // apostrophe — skip (contractions handled elsewhere)
        i += 1;
        continue;
      }
      parts.push({ g: single, taught: scope.taughtSimple.has(single), kind: "grapheme" });
      i += 1;
    }

    /* Step 4 — suffix. Reported as "-<suffix>" and checked against taughtSuffix. */
    if (suffix) {
      parts.push({
        g: "-" + suffix,
        taught: scope.taughtSuffix.has(suffix),
        kind: "suffix"
      });
    }

    const untaught = parts
      .filter((p) => !p.taught && p.kind !== "silent")
      .map((p) => p.g);
    const allTaught = untaught.length === 0;

    return { parts, allTaught, untaught };
  }

  /* =========================================================
     Validate a passage against a spec.
     spec: {
       week, targetWords:[...], wordCount, maxSentenceLen,
       sightWordCap, decodabilityFloor, minTargetHits
     }
     sequence: SCOPE_AND_SEQUENCE array
     Returns: { passed, checks: [...], stats: {...}, tokenInfo: {...} }
     ========================================================= */

  /**
   * Tokenise `text`, classify every distinct token and run the rule checks.
   *
   * Classification order per distinct token:
   *   1. in COMMON_CONTRACTIONS   -> "contraction" (counted with sight words)
   *   2. in HARD_IRREGULAR_SIGHT  -> "sight"
   *   3. parseWord(...).allTaught -> "decodable"
   *   4. in window.JEDAI_DATA.FRY_SET (when present) -> "sight"
   *   5. otherwise                -> "untaught"
   *
   * A check is only added when its spec field is truthy; "word-repetition"
   * and "untaught-graphemes" are only added when they fail. `passed` is true
   * when every added check passes.
   *
   * @param {string} text passage text
   * @param {object} spec see the block comment above
   * @param {Array<object>} sequence handed to buildScope together with spec.week
   * @returns {{passed:boolean, checks:object[], stats:object, tokenInfo:object}}
   */
  function validatePassage(text, spec, sequence) {
    const scope = buildScope(sequence, spec.week);

    // Typographic apostrophes (U+2019, U+02BC) are folded to "'" so that
    // "don’t" stays one token instead of splitting into "don" + "t".
    const foldApostrophes = (s) => s.replace(/[\u2019\u02BC]/g, "'");
    const tokens = foldApostrophes(String(text ?? "").toLowerCase()).match(/[a-z]+(?:'[a-z]+)?/g) || [];
    const tokenSet = new Set(tokens);
    const distinctTokens = Array.from(tokenSet);
    const targets = (spec.targetWords || [])
      .map((w) => foldApostrophes(w.toLowerCase()).trim())
      .filter(Boolean);
    const data = window.JEDAI_DATA;
    const fry = data ? data.FRY_SET : new Set();

    /* Classify each distinct token */
    const tokenInfo = {};
    let decodableCount = 0;
    const untaughtMap = new Map();   // grapheme -> example words
    const untaughtWords = [];
    const sightUsed = new Set();

    for (const tok of distinctTokens) {
      // Contraction handled as a sight word if in COMMON_CONTRACTIONS
      if (COMMON_CONTRACTIONS.has(tok)) {
        tokenInfo[tok] = { kind: "contraction", taught: true };
        sightUsed.add(tok);
        continue;
      }
      if (HARD_IRREGULAR_SIGHT.has(tok)) {
        tokenInfo[tok] = { kind: "sight", taught: true };
        sightUsed.add(tok);
        continue;
      }
      const parsed = parseWord(tok, scope);
      if (parsed.allTaught) {
        tokenInfo[tok] = { kind: "decodable", taught: true, parts: parsed.parts };
        decodableCount += 1;
        continue;
      }
      // Not decodable yet: a Fry-list word still counts as a sight word.
      if (fry.has(tok)) {
        tokenInfo[tok] = { kind: "sight", taught: true };
        sightUsed.add(tok);
        continue;
      }
      tokenInfo[tok] = { kind: "untaught", taught: false, parts: parsed.parts, untaught: parsed.untaught };
      untaughtWords.push(tok);
      for (const g of parsed.untaught) {
        if (!untaughtMap.has(g)) untaughtMap.set(g, []);
        untaughtMap.get(g).push(tok);
      }
    }

    const sightCount = sightUsed.size;
    const totalDistinct = distinctTokens.length;
    // Share of distinct non-sight tokens that are decodable. The Math.max
    // only prevents division by zero; a passage made solely of sight words
    // therefore scores 0.
    const decodabilityPct = totalDistinct === 0 ? 0 :
      Math.round(100 * decodableCount / Math.max(1, totalDistinct - sightUsed.size));

    /* Run rule checks */
    const checks = [];

    // 1. Target words inclusion (minTargetHits falsy -> every target required)
    if (targets.length) {
      const missing = targets.filter((w) => !tokenSet.has(w));
      const minHits = spec.minTargetHits || targets.length;
      const hits = targets.length - missing.length;
      checks.push({
        id: "target-words",
        label: `Target words present (${hits}/${targets.length})`,
        pass: hits >= minHits,
        detail: missing.length ? `Missing: ${missing.join(", ")}` : ""
      });
    }

    // 2. Word count — allowed band is 80%..125% of the target, never below 20
    const totalWords = tokens.length;
    const wcTarget = spec.wordCount || 0;
    if (wcTarget) {
      const minWc = Math.max(20, Math.floor(wcTarget * 0.8));
      const maxWc = Math.ceil(wcTarget * 1.25);
      checks.push({
        id: "word-count",
        label: `Word count in range (${totalWords} words, want ~${wcTarget})`,
        pass: totalWords >= minWc && totalWords <= maxWc,
        detail: totalWords < minWc ? `Too short — minimum ${minWc}` :
                totalWords > maxWc ? `Too long — maximum ${maxWc}` : ""
      });
    }

    // 3. Max sentence length (sentence splitting is delegated to JEDAI_DATA;
    //    without it no sentence is ever reported as too long)
    const sentenceWords = (data ? data.sentenceLengths(text) : []);
    const longSentences = sentenceWords.filter((n) => n > (spec.maxSentenceLen || 999));
    if (spec.maxSentenceLen) {
      checks.push({
        id: "sentence-length",
        label: `All sentences ≤ ${spec.maxSentenceLen} words`,
        pass: longSentences.length === 0,
        detail: longSentences.length ? `${longSentences.length} sentence(s) too long (max found: ${Math.max(...longSentences)})` : ""
      });
    }

    // 4. Decodability floor
    if (spec.decodabilityFloor) {
      checks.push({
        id: "decodability",
        label: `Decodability ≥ ${spec.decodabilityFloor}% (got ${decodabilityPct}%)`,
        pass: decodabilityPct >= spec.decodabilityFloor,
        detail: untaughtWords.length
          ? `${untaughtWords.length} untaught word(s): ${untaughtWords.slice(0, 6).join(", ")}${untaughtWords.length > 6 ? "…" : ""}`
          : ""
      });
    }

    // 5. Sight word cap
    if (spec.sightWordCap) {
      checks.push({
        id: "sight-cap",
        label: `Sight words ≤ ${spec.sightWordCap} distinct (got ${sightCount})`,
        pass: sightCount <= spec.sightWordCap,
        detail: sightCount > spec.sightWordCap
          ? `Trim sight words: ${Array.from(sightUsed).slice(0, 10).join(", ")}…`
          : ""
      });
    }

    // 6. Word repetition cap — no content word more than 5×.
    //    Hard sight words, target words and words of 3 letters or fewer are
    //    exempt. A Map is used so that tokens such as "constructor" cannot
    //    collide with properties inherited from Object.prototype.
    const wordFreq = new Map();
    for (const tok of tokens) wordFreq.set(tok, (wordFreq.get(tok) || 0) + 1);
    const targetSet = new Set(targets);
    const overused = Array.from(wordFreq.entries())
      .filter(([w, n]) => n > 5 && !HARD_IRREGULAR_SIGHT.has(w) && !targetSet.has(w) && w.length > 3)
      .sort((a, b) => b[1] - a[1]);
    if (overused.length) {
      checks.push({
        id: "word-repetition",
        label: `No word used more than 5× (found ${overused.length})`,
        pass: false,
        detail: overused.slice(0, 5).map(([w, n]) => `"${w}" ${n}×`).join(", ")
      });
    }

    // 7. Untaught grapheme summary.
    //    NOTE(review): originally described as informational, but the check
    //    is pushed with pass:false and therefore makes `passed` false whenever
    //    any untaught grapheme exists — confirm that this is intended.
    const untaughtGraphemeList = Array.from(untaughtMap.entries()).map(([g, words]) => ({
      g,
      examples: words.slice(0, 4)
    }));
    if (untaughtGraphemeList.length) {
      checks.push({
        id: "untaught-graphemes",
        label: `No untaught graphemes (found ${untaughtGraphemeList.length})`,
        pass: false,
        detail: untaughtGraphemeList.slice(0, 5).map((u) => `"${u.g}" in ${u.examples.join("/")}`).join(" · ")
      });
    }

    const passed = checks.every((c) => c.pass);

    return {
      passed,
      checks,
      stats: {
        totalWords,
        distinctTokens: totalDistinct,
        decodabilityPct,
        sightCount,
        sightUsed: Array.from(sightUsed),
        untaughtWords,
        untaughtGraphemes: untaughtGraphemeList
      },
      tokenInfo
    };
  }

  /* =========================================================
     Render-helper: annotate text with span markers (return array
     of {text, kind} chunks suitable for React rendering).
     kinds: 'target', 'sight', 'untaught', 'plain'
     ========================================================= */

  /**
   * Split `text` into alternating word / non-word chunks and tag each word.
   * Concatenating every chunk's `text` reproduces the input exactly.
   *
   * Tag precedence for a word: target word, then sight word or contraction,
   * then untaught; anything else (including decodable words) is 'plain'.
   *
   * @param {string} text passage text
   * @param {object} spec validation spec; `targetWords` drives the 'target' tag
   * @param {Array<object>} sequence forwarded to validatePassage
   * @returns {{chunks:Array<{text:string,kind:string}>, result:object}}
   *          `result` is the full validatePassage return value.
   */
  function annotatePassage(text, spec, sequence) {
    const result = validatePassage(text, spec, sequence);
    // Typographic apostrophes are folded to "'" for lookups only; the chunk
    // text itself is left exactly as written.
    const foldApostrophes = (s) => s.replace(/[\u2019\u02BC]/g, "'");
    // Normalised the same way the validator normalises its targets
    // (lower-case, trimmed, blanks dropped) so both agree on what a target is.
    const targets = new Set(
      (spec.targetWords || [])
        .map((w) => foldApostrophes(w.toLowerCase()).trim())
        .filter(Boolean)
    );
    const tokenInfo = result.tokenInfo;
    const chunks = [];
    // A word is letters with at most one internal apostrophe group; everything
    // between words is emitted as a single plain chunk.
    const re = /([A-Za-z]+(?:['\u2019\u02BC][A-Za-z]+)?|[^A-Za-z]+)/g;
    let m;
    while ((m = re.exec(String(text ?? ""))) !== null) {
      const piece = m[0];
      if (/^[A-Za-z]/.test(piece)) {
        const lower = foldApostrophes(piece.toLowerCase());
        const info = tokenInfo[lower];
        let kind = "plain";
        if (targets.has(lower)) kind = "target";
        else if (info?.kind === "sight" || info?.kind === "contraction") kind = "sight";
        else if (info?.kind === "untaught") kind = "untaught";
        chunks.push({ text: piece, kind });
      } else {
        chunks.push({ text: piece, kind: "plain" });
      }
    }
    return { chunks, result };
  }

  /* =========================================================
     Export
     ========================================================= */

  // Public surface of the validator: the four functions plus the lookup
  // tables, published as a single global for other browser scripts.
  const publicApi = {
    buildScope,
    parseWord,
    validatePassage,
    annotatePassage,
    TRIGRAPHS,
    DIGRAPHS,
    COMMON_CONTRACTIONS,
    HARD_IRREGULAR_SIGHT
  };
  window.JEDAI_VALIDATOR = publicApi;
})();
