#!/usr/bin/env python3
"""Cross-check the CAUTION over-annotation query hits against the actions that the
existing gene reviews actually assigned.

This validates the queries: a flagged over-annotation that the curator already
set to REMOVE/MARK_AS_OVER_ANNOTATED/MODIFY is a CONFIRMED catch; one left as
ACCEPT/KEEP is either (a) a genuine miss to investigate, or (b) a query
false-positive (the contested CAUTION paper also supports a legitimate, different
annotation, or the parent term is genuinely retained).

Inputs (produced by caution_conjunction_queries.py):
    conjunction_hits.tsv, caution_pmid_unnegated.tsv
Reviews: genes/*/<gene>/<gene>-ai-review.yaml

Output: audit_queries_vs_reviews.md
"""
from __future__ import annotations
import csv, glob, yaml
from pathlib import Path

# Directory containing this script; the input TSVs are read from it and the
# Markdown report is written into it.
OUT = Path(__file__).resolve().parent
# Two directory levels above OUT; review files are globbed relative to it.
REPO = OUT.parents[1]
# Review actions that count as the curator having already acted on a flagged
# annotation (such hits are classified as CONFIRMED).
ACTIONED = {"REMOVE", "MARK_AS_OVER_ANNOTATED", "MODIFY"}

def review_actions(gene):
    """Map each positively annotated term in a gene's review to its actions.

    Looks for ``genes/*/<gene>/<gene>-ai-review.yaml`` under ``REPO``.

    Args:
        gene: Gene name, used both as directory name and file-name prefix.

    Returns:
        ``None`` when no review file exists for the gene. Otherwise a dict
        mapping term id to a list of ``(action, reference_id)`` tuples, one
        per non-negated entry of ``existing_annotations``. ``action`` is
        ``None`` when the entry has no review action; ``reference_id`` is the
        ``original_reference_id`` converted with ``str``.
    """
    # glob returns matches in arbitrary order; sort so the same file is picked
    # on every run if the gene directory exists under more than one parent.
    matches = sorted(glob.glob(str(REPO / f"genes/*/{gene}/{gene}-ai-review.yaml")))
    if not matches:
        return None
    # safe_load yields None for an empty document; treat that as "no annotations".
    doc = yaml.safe_load(Path(matches[0]).read_text(encoding="utf-8")) or {}
    by_term = {}
    # `or []` / `or {}` guard against keys that are present but set to null.
    for ann in doc.get("existing_annotations") or []:
        if ann.get("negated"):
            continue
        action = (ann.get("review") or {}).get("action")
        by_term.setdefault(ann["term"]["id"], []).append(
            (action, str(ann.get("original_reference_id"))))
    return by_term

def classify(acts):
    """Turn the review actions recorded for a flagged term into an audit status.

    ``acts`` is ``None`` when the gene has no review, otherwise the list of
    actions found for the flagged term.

    Returns:
        ``"NO_REVIEW"`` for ``None``, ``"NOT_FOUND"`` for an empty list,
        ``"CONFIRMED"`` when every action is in ``ACTIONED`` (the query agrees
        with the curator), and ``"ACCEPTED"`` otherwise (flagged but kept, so
        either something to investigate or a query false-positive).
    """
    if acts is not None and acts:
        kept_somewhere = any(action not in ACTIONED for action in acts)
        return "ACCEPTED" if kept_somewhere else "CONFIRMED"
    return "NO_REVIEW" if acts is None else "NOT_FOUND"

def _read_tsv(path):
    """Return every row of the tab-separated file at ``path`` as a dict keyed by header."""
    # The context manager closes the handle; newline="" is what the csv module
    # asks for so that it can handle line endings itself.
    with open(path, newline="") as fh:
        return list(csv.DictReader(fh, delimiter="\t"))


def _cites_pmid(ref, pmid):
    """Return True when ``ref`` contains ``pmid`` as a complete number.

    A plain substring test would let PMID 1234 match a reference to PMID 12345,
    so an occurrence only counts when it is not flanked by another digit.
    """
    if not pmid:
        return False
    start = ref.find(pmid)
    while start != -1:
        end = start + len(pmid)
        digit_before = start > 0 and ref[start - 1].isdigit()
        digit_after = end < len(ref) and ref[end].isdigit()
        if not (digit_before or digit_after):
            return True
        start = ref.find(pmid, start + 1)
    return False


def main():
    """Compare both query outputs with the review actions and write the report.

    Query A takes the rows of ``conjunction_hits.tsv`` whose positive evidence
    is IEA or IBA and whose parent support is UNSUPPORTED, and classifies the
    review actions recorded for each flagged positive term.

    Query B takes the MF terms listed in ``caution_pmid_unnegated.tsv`` and
    classifies the review actions of annotations of that term whose reference
    cites the row's caution PMID. Terms the review does not cite to that PMID
    are skipped.

    The result is written to ``audit_queries_vs_reviews.md`` next to this
    script and the per-status counts are printed.
    """
    reviews = {}

    def actions_for(gene):
        # Parse each gene's review file at most once per run.
        if gene not in reviews:
            reviews[gene] = review_actions(gene)
        return reviews[gene]

    # ---- Query A: STRONG conjunctions (electronic unsupported parent) ----
    a_rows = _read_tsv(OUT / "conjunction_hits.tsv")
    a_strong = [
        r for r in a_rows
        if r["pos_evidence"] in ("IEA", "IBA") and r["parent_support"] == "UNSUPPORTED"
    ]
    a_stat = {"CONFIRMED": 0, "ACCEPTED": 0, "NO_REVIEW": 0, "NOT_FOUND": 0}
    a_detail = []
    for r in a_strong:
        m = actions_for(r["gene"])
        acts = [a for a, _ in (m or {}).get(r["positive_term"], [])]
        # None (no review at all) must stay distinguishable from "term absent".
        c = classify(acts if m is not None else None)
        a_stat[c] += 1
        a_detail.append((c, r["gene"], r["positive_term"], r["positive_name"], acts))

    # ---- Query B: positive MF terms cited to a contested CAUTION-PMID ----
    b_rows = _read_tsv(OUT / "caution_pmid_unnegated.tsv")
    b_stat = {"CONFIRMED": 0, "ACCEPTED": 0, "NO_REVIEW": 0, "NOT_FOUND": 0}
    b_detail, seen = [], set()
    for r in b_rows:
        g, pmid = r["gene"], r["caution_pmid"]
        m = actions_for(g)
        for chunk in r["positive_terms"].split(";"):
            chunk = chunk.strip()
            if not chunk.startswith("GO:") or "(M," not in chunk:  # MF terms only
                continue
            term = chunk.split("(")[0].strip()
            key = (g, term, pmid)
            if key in seen:
                continue
            seen.add(key)
            acts = [a for a, ref in (m or {}).get(term, []) if _cites_pmid(ref, pmid)]
            if m is not None and not acts:   # term not cited to this PMID in review
                continue
            c = classify(acts if m is not None else None)
            b_stat[c] += 1
            b_detail.append((c, g, pmid, term, acts))

    md = OUT / "audit_queries_vs_reviews.md"
    # The report contains non-ASCII text, so pin the encoding instead of
    # depending on the platform default.
    with md.open("w", encoding="utf-8") as fh:
        fh.write("---\ntitle: \"Audit: CAUTION queries vs existing review actions\"\n---\n\n")
        fh.write("# Audit — do the CAUTION queries agree with curated review actions?\n\n")
        fh.write("Auto-generated by `audit_queries_vs_reviews.py`. Do not edit by hand.\n\n")
        fh.write("`CONFIRMED` = curator already set REMOVE/MARK_AS_OVER_ANNOTATED/MODIFY (query agrees). "
                 "`ACCEPTED` = flagged but kept (investigate, or query false-positive).\n\n")
        fh.write(f"## Query A STRONG conjunctions ({len(a_strong)} flags)\n\n")
        # NOT_FOUND rows appear in the table below, so report their count too;
        # otherwise the summary does not add up to the number of flags.
        fh.write(f"CONFIRMED={a_stat['CONFIRMED']} ACCEPTED={a_stat['ACCEPTED']} "
                 f"NO_REVIEW={a_stat['NO_REVIEW']} NOT_FOUND={a_stat['NOT_FOUND']}\n\n")
        fh.write("| status | gene | flagged parent term | review action |\n|---|---|---|---|\n")
        for c, g, t, n, acts in sorted(a_detail):
            fh.write(f"| {c} | {g} | {t} {n} | {acts} |\n")
        fh.write(f"\n## Query B MF terms cited to a contested CAUTION-PMID ({b_stat['CONFIRMED']+b_stat['ACCEPTED']} checked)\n\n")
        fh.write(f"CONFIRMED={b_stat['CONFIRMED']} ACCEPTED={b_stat['ACCEPTED']}\n\n")
        fh.write("| status | gene | caution PMID | MF term | review action |\n|---|---|---|---|---|\n")
        for c, g, pmid, t, acts in sorted(b_detail):
            fh.write(f"| {c} | {g} | PMID:{pmid} | {t} | {acts} |\n")
    print(f"Query A STRONG: {a_stat}")
    print(f"Query B MF:     {b_stat}")
    print("wrote", md)

# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
