{
  "filename": "systematic_error_analysis.png",
  "iteration": 2,
  "description": "Create figure showing k-mer similarity matrix and the systematic TreeGrafter error affecting both reviewed and unreviewed AprA proteins",
  "timestamp": "2026-06-23 22:33:26",
  "code": "\nimport matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n# Create a comprehensive figure showing the k-mer similarity results and the systematic error\nfig, axes = plt.subplots(1, 2, figsize=(16, 7))\n\n# Panel A: K-mer similarity heatmap\nax = axes[0]\nax.set_title('A. Sequence Similarity (5-mer Jaccard Index)\\nQ72DT2 clusters with AprA, not SdhA', fontweight='bold', fontsize=11)\n\nlabels = [\n    'Q72DT2\\n(DvH AprA)',\n    'T2G6Z9\\n(Mg AprA)\\n[reviewed]',\n    'Q59116\\n(Af AprA)',\n    'P0AC41\\n(Ec SdhA)',\n    'P31040\\n(Hs SDHA)',\n    'P00363\\n(Ec FrdA)',\n]\n\n# 5-mer similarity matrix (computed above)\nsim_matrix = np.array([\n    [1.000, 0.435, 0.033, 0.002, 0.003, 0.000],  # Q72DT2\n    [0.435, 1.000, 0.033, 0.002, 0.002, 0.000],  # T2G6Z9\n    [0.033, 0.033, 1.000, 0.002, 0.000, 0.002],  # Q59116\n    [0.002, 0.002, 0.002, 1.000, 0.051, 0.034],  # P0AC41 (estimated SdhA-FrdA ~0.034)\n    [0.003, 0.002, 0.000, 0.051, 1.000, 0.030],  # P31040\n    [0.000, 0.000, 0.002, 0.034, 0.030, 1.000],  # P00363\n])\n\n# Make symmetric\nsim_matrix = (sim_matrix + sim_matrix.T) / 2\n\nim = ax.imshow(sim_matrix, cmap='YlOrRd', vmin=0, vmax=0.5)\nax.set_xticks(range(6))\nax.set_yticks(range(6))\nax.set_xticklabels(labels, fontsize=7, rotation=45, ha='right')\nax.set_yticklabels(labels, fontsize=7)\n\n# Add text values\nfor i in range(6):\n    for j in range(6):\n        val = sim_matrix[i, j]\n        color = 'white' if val > 0.25 else 'black'\n        ax.text(j, i, f'{val:.3f}', ha='center', va='center', fontsize=7, color=color)\n\n# Add boxes around AprA and SdhA groups\nfrom matplotlib.patches import Rectangle\nax.add_patch(Rectangle((-0.5, -0.5), 3, 3, linewidth=3, edgecolor='blue', facecolor='none', linestyle='--', label='AprA group'))\nax.add_patch(Rectangle((2.5, 2.5), 3, 3, linewidth=3, edgecolor='red', facecolor='none', linestyle='--', label='SdhA/FrdA group'))\n\nax.legend(loc='lower left', fontsize=8)\nplt.colorbar(im, ax=ax, shrink=0.8, label='5-mer Jaccard Similarity')\n\n# Panel B: The systematic error demonstration\nax = axes[1]\nax.set_title('B. Systematic TreeGrafter Misannotation\\nSame error on reviewed and unreviewed AprA', fontweight='bold', fontsize=11)\nax.axis('off')\n\ntext = \"\"\"REVIEWED AprA: T2G6Z9 (M. gigas)\n  Experimentally verified function:\n    GO:0009973 adenylyl-sulfate reductase  [IDA:UniProtKB]\n    GO:0071949 FAD binding                 [IDA:UniProtKB]\n    GO:0019420 dissimilatory sulfate reduction [IDA:UniProtKB]\n\n  TreeGrafter misannotation (SAME BUG):\n    GO:0000104 succinate dehydrogenase     [IEA:TreeGrafter]\n\n  PANTHER subfamily (SAME ERROR):\n    PTHR11632:SF51 \"SUCCINATE DEHYDROGENASE\n    [UBIQUINONE] FLAVOPROTEIN SUBUNIT, MITOCHONDRIAL\"\n\n\nUNREVIEWED AprA: Q72DT2 (D. vulgaris Hildenborough)\n  Correct annotation (from EC):\n    GO:0009973 adenylyl-sulfate reductase  [IEA:UniProtKB-EC]\n\n  TreeGrafter misannotation (SAME BUG):\n    GO:0000104 succinate dehydrogenase     [IEA:TreeGrafter]\n\n  PANTHER subfamily (SAME ERROR):\n    PTHR11632:SF51 \"SUCCINATE DEHYDROGENASE\n    [UBIQUINONE] FLAVOPROTEIN SUBUNIT, MITOCHONDRIAL\"\n\n\nCONCLUSION: PANTHER subfamily PTHR11632:SF51 has\na systematic misclassification of AprA proteins.\nThe TreeGrafter error propagates GO:0000104 to\nALL proteins in this subfamily, regardless of\ntheir actual enzymatic function.\"\"\"\n\nax.text(0.05, 0.95, text, transform=ax.transAxes, fontsize=8.5,\n        fontfamily='monospace', verticalalignment='top',\n        bbox=dict(boxstyle='round', facecolor='#fff3e0', alpha=0.9))\n\nplt.tight_layout()\nplt.savefig('systematic_error_analysis.png', dpi=150, bbox_inches='tight')\nplt.show()\nprint(\"Figure saved: systematic_error_analysis.png\")\n"
}