{
  "filename": "phylogenetic_tree.png",
  "iteration": 2,
  "description": "Build phylogenetic tree of plant GUF1 paralogs showing chloroplastic vs mitochondrial clades",
  "timestamp": "2026-06-22 09:16:50",
  "code": "\nimport requests\nimport numpy as np\nfrom collections import Counter\n\n# Alignment-free clustering of plant GUF1/EF-4 paralogs: fetch the sequences from\n# UniProt, compute pairwise k-mer Jaccard distances, draw a UPGMA dendrogram and\n# list the distance from the A. annua target to every other sequence.\n\nTARGET_ACC = 'A0A2U1PS28'\nREQUEST_TIMEOUT_S = 30  # without a timeout a stalled connection would hang the run\n\n# Re-fetch all sequences in a single code block\naccessions_to_fetch = [\n    'A0A2U1PS28', 'A0A2U1PRZ2', 'A0A2U1PCN3',  # A. annua target, chloro, mito\n    'Q9FNM5', 'Q9FLE4',         # Arabidopsis chloro, mito\n    'K7UFR2', 'A0A804QYQ1',    # Maize chloro, mito\n    'I1KBR9', 'A0A0R0JP46',    # Soybean chloro, mito\n    'B9F2U5', 'Q5VQ69',         # Rice chloro, mito\n    'A0A072V0U5', 'G7KZV0',     # Medicago chloro, mito\n    'A0A3Q7GS73', 'A0A3Q7G7B6', # Tomato chloro, mito\n    'B9GHA6', 'B9IJU6',         # Poplar chloro, mito\n    'A5B4D2', 'F6H5P1',         # Grape chloro, mito\n]\n\nlabels = {\n    'A0A2U1PS28': 'A.annua_TARGET',\n    'A0A2U1PRZ2': 'A.annua_chloro',\n    'A0A2U1PCN3': 'A.annua_mito',\n    'Q9FNM5': 'Arab_chloro*',\n    'Q9FLE4': 'Arab_mito*',\n    'K7UFR2': 'Maize_chloro',\n    'A0A804QYQ1': 'Maize_mito',\n    'I1KBR9': 'Soybean_chloro',\n    'A0A0R0JP46': 'Soybean_mito',\n    'B9F2U5': 'Rice_chloro',\n    'Q5VQ69': 'Rice_mito',\n    'A0A072V0U5': 'Medicago_chloro',\n    'G7KZV0': 'Medicago_mito',\n    'A0A3Q7GS73': 'Tomato_chloro',\n    'A0A3Q7G7B6': 'Tomato_mito',\n    'B9GHA6': 'Poplar_chloro',\n    'B9IJU6': 'Poplar_mito',\n    'A5B4D2': 'Grape_chloro',\n    'F6H5P1': 'Grape_mito',\n}\n\n# Best-effort fetch: a failed accession is skipped but reported, never silently dropped\nseqs = {}\nfailed = []\nfor acc in accessions_to_fetch:\n    url = 'https://rest.uniprot.org/uniprotkb/' + acc + '.fasta'\n    try:\n        resp = requests.get(url, timeout=REQUEST_TIMEOUT_S)\n    except requests.RequestException as exc:\n        failed.append(acc + ' (' + type(exc).__name__ + ')')\n        continue\n    if resp.status_code != 200:\n        failed.append(acc + ' (HTTP ' + str(resp.status_code) + ')')\n        continue\n    # The first FASTA line is the header; the remaining lines hold the residues\n    seq = ''.join(line.strip() for line in resp.text.strip().splitlines()[1:])\n    if not seq:\n        failed.append(acc + ' (empty sequence)')\n        continue\n    seqs[acc] = seq\n\nprint('Fetched', len(seqs), 'sequences')\nif failed:\n    print('WARNING: could not fetch: ' + ', '.join(failed))\n\nnames = list(seqs.keys())\nn = len(names)\nif n < 2:\n    # linkage() cannot cluster fewer than two observations\n    raise RuntimeError('Need at least 2 sequences to cluster, fetched ' + str(n))\n\n# Compute pairwise k-mer Jaccard distances\nk = 7\n# Build every k-mer set once up front instead of rebuilding it for each pair\nkmer_sets = [\n    set(seqs[name][pos:pos + k] for pos in range(len(seqs[name]) - k + 1))\n    for name in names\n]\ndist_matrix = np.zeros((n, n))\n\nfor i in range(n):\n    for j in range(i + 1, n):\n        shared = len(kmer_sets[i] & kmer_sets[j])\n        total = len(kmer_sets[i] | kmer_sets[j])\n        # max(1, total) avoids a zero division when both k-mer sets are empty\n        dist = 1 - shared / max(1, total)\n        dist_matrix[i, j] = dist\n        dist_matrix[j, i] = dist\n\n# UPGMA (average-linkage) hierarchical clustering with scipy; this is not neighbor-joining\nfrom scipy.cluster.hierarchy import linkage, dendrogram\nfrom scipy.spatial.distance import squareform\n\n# Convert to condensed form\ncondensed = squareform(dist_matrix)\n\n# UPGMA clustering\nZ = linkage(condensed, method='average')\n\n# Draw the dendrogram with the non-interactive Agg backend and save it as a PNG\nimport matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\n\nfig, ax = plt.subplots(figsize=(14, 10))\nleaf_labels = [labels.get(name, name) for name in names]\n\n\ndef get_leaf_color(label):\n    '''Return the tick-label colour for a leaf label, or None to keep the default.\n\n    Chloroplastic entries and the TARGET map to dark green, mitochondrial\n    entries to dark red.\n    '''\n    lowered = label.lower()\n    if 'chloro' in lowered or 'TARGET' in label:\n        return 'darkgreen'\n    if 'mito' in lowered:\n        return 'darkred'\n    return None\n\n\ndendrogram(Z, labels=leaf_labels, orientation='left', ax=ax, leaf_font_size=11)\n\nax.set_title('Phylogenetic Clustering of Plant GUF1/EF-4 Paralogs\\n(k=' + str(k) + ' Jaccard distance, UPGMA)', fontsize=14)\nax.set_xlabel('K-mer Jaccard Distance', fontsize=12)\n\n# Color the labels\nfor lbl in ax.get_yticklabels():\n    txt = lbl.get_text()\n    color = get_leaf_color(txt)\n    if color is not None:\n        lbl.set_color(color)\n    if 'TARGET' in txt:\n        lbl.set_fontweight('bold')\n        lbl.set_fontsize(13)\n\n# Add legend\nfrom matplotlib.patches import Patch\nlegend_elements = [\n    Patch(facecolor='darkgreen', label='Chloroplastic paralogs'),\n    Patch(facecolor='darkred', label='Mitochondrial paralogs'),\n]\nax.legend(handles=legend_elements, loc='lower right', fontsize=11)\n\nplt.tight_layout()\nplt.savefig('phylogenetic_tree.png', dpi=150, bbox_inches='tight')\nplt.show()\nprint('\\nPhylogenetic tree saved.')\n\n# Print distance from TARGET to all others\nif TARGET_ACC in seqs:\n    print('\\n=== Distances from ' + TARGET_ACC + ' (TARGET) ===')\n    target_idx = names.index(TARGET_ACC)\n    distances = [\n        (labels.get(names[i], names[i]), dist_matrix[target_idx, i])\n        for i in range(n) if i != target_idx\n    ]\n    distances.sort(key=lambda x: x[1])\n    for lab, d in distances:\n        # Equality with the smallest distance also marks exact ties as CLOSEST\n        marker = ' <-- CLOSEST' if d == distances[0][1] else ''\n        print('  ' + lab.ljust(25) + ': ' + str(round(d, 4)) + marker)\nelse:\n    # Without this guard names.index() raises ValueError when the target fetch failed\n    print('\\nTARGET ' + TARGET_ACC + ' was not fetched; skipping distance report.')\n"
}