Coverage for bloodhound/markers.py : 95.10%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import os
2import subprocess
3from typing import Dict, List, Tuple
4import gzip
5import shutil
6import tempfile
7from rich.progress import (
8 Progress,
9 TextColumn,
10 BarColumn,
11 TaskProgressColumn,
12 TimeRemainingColumn,
13)
15# Marker information
16BAC120_MARKERS = [
17 "PF00380.20",
18 "PF00410.20",
19 "PF00466.21",
20 "PF01025.20",
21 "PF02576.18",
22 "PF03726.15",
23 "TIGR00006",
24 "TIGR00019",
25 "TIGR00020",
26 "TIGR00029",
27 "TIGR00043",
28 "TIGR00054",
29 "TIGR00059",
30 "TIGR00061",
31 "TIGR00064",
32 "TIGR00065",
33 "TIGR00082",
34 "TIGR00083",
35 "TIGR00084",
36 "TIGR00086",
37 "TIGR00088",
38 "TIGR00090",
39 "TIGR00092",
40 "TIGR00095",
41 "TIGR00115",
42 "TIGR00116",
43 "TIGR00138",
44 "TIGR00158",
45 "TIGR00166",
46 "TIGR00168",
47 "TIGR00186",
48 "TIGR00194",
49 "TIGR00250",
50 "TIGR00337",
51 "TIGR00344",
52 "TIGR00362",
53 "TIGR00382",
54 "TIGR00392",
55 "TIGR00396",
56 "TIGR00398",
57 "TIGR00414",
58 "TIGR00416",
59 "TIGR00420",
60 "TIGR00431",
61 "TIGR00435",
62 "TIGR00436",
63 "TIGR00442",
64 "TIGR00445",
65 "TIGR00456",
66 "TIGR00459",
67 "TIGR00460",
68 "TIGR00468",
69 "TIGR00472",
70 "TIGR00487",
71 "TIGR00496",
72 "TIGR00539",
73 "TIGR00580",
74 "TIGR00593",
75 "TIGR00615",
76 "TIGR00631",
77 "TIGR00634",
78 "TIGR00635",
79 "TIGR00643",
80 "TIGR00663",
81 "TIGR00717",
82 "TIGR00755",
83 "TIGR00810",
84 "TIGR00922",
85 "TIGR00928",
86 "TIGR00959",
87 "TIGR00963",
88 "TIGR00964",
89 "TIGR00967",
90 "TIGR01009",
91 "TIGR01011",
92 "TIGR01017",
93 "TIGR01021",
94 "TIGR01029",
95 "TIGR01032",
96 "TIGR01039",
97 "TIGR01044",
98 "TIGR01059",
99 "TIGR01063",
100 "TIGR01066",
101 "TIGR01071",
102 "TIGR01079",
103 "TIGR01082",
104 "TIGR01087",
105 "TIGR01128",
106 "TIGR01146",
107 "TIGR01164",
108 "TIGR01169",
109 "TIGR01171",
110 "TIGR01302",
111 "TIGR01391",
112 "TIGR01393",
113 "TIGR01394",
114 "TIGR01510",
115 "TIGR01632",
116 "TIGR01951",
117 "TIGR01953",
118 "TIGR02012",
119 "TIGR02013",
120 "TIGR02027",
121 "TIGR02075",
122 "TIGR02191",
123 "TIGR02273",
124 "TIGR02350",
125 "TIGR02386",
126 "TIGR02397",
127 "TIGR02432",
128 "TIGR02729",
129 "TIGR03263",
130 "TIGR03594",
131 "TIGR03625",
132 "TIGR03632",
133 "TIGR03654",
134 "TIGR03723",
135 "TIGR03725",
136 "TIGR03953",
137]
139AR53_MARKERS = [
140 "PF04919.13",
141 "PF07541.13",
142 "PF01000.27",
143 "PF00687.22",
144 "PF00466.21",
145 "PF00827.18",
146 "PF01280.21",
147 "PF01090.20",
148 "PF01200.19",
149 "PF01015.19",
150 "PF00900.21",
151 "PF00410.20",
152 "TIGR00037",
153 "TIGR00064",
154 "TIGR00111",
155 "TIGR00134",
156 "TIGR00279",
157 "TIGR00291",
158 "TIGR00323",
159 "TIGR00335",
160 "TIGR00373",
161 "TIGR00405",
162 "TIGR00448",
163 "TIGR00483",
164 "TIGR00491",
165 "TIGR00522",
166 "TIGR00967",
167 "TIGR00982",
168 "TIGR01008",
169 "TIGR01012",
170 "TIGR01018",
171 "TIGR01020",
172 "TIGR01028",
173 "TIGR01046",
174 "TIGR01052",
175 "TIGR01171",
176 "TIGR01213",
177 "TIGR01952",
178 "TIGR02236",
179 "TIGR02338",
180 "TIGR02389",
181 "TIGR02390",
182 "TIGR03626",
183 "TIGR03627",
184 "TIGR03628",
185 "TIGR03629",
186 "TIGR03670",
187 "TIGR03671",
188 "TIGR03672",
189 "TIGR03673",
190 "TIGR03674",
191 "TIGR03676",
192 "TIGR03680",
193]
196def read_fasta(path: str) -> Dict[str, str]:
197 """
198 Read a FASTA file into a dictionary of sequences.
200 Parameters
201 ----------
202 path : str
203 Filesystem path to the input FASTA file.
205 Returns
206 -------
207 Dict[str, str]
208 Mapping from sequence ID (the first token after ‘>’ in the header)
209 to the full sequence string, with any terminal “*” characters stripped.
211 Raises
212 ------
213 IOError
214 If the file cannot be opened for reading.
215 """
216 seqs: Dict[str, str] = {}
217 with open(path) as fh:
218 header, buffer = None, []
219 for line in fh:
220 line = line.rstrip()
221 if not line:
222 continue
223 if line.startswith(">"):
224 if header:
225 seqs[header] = "".join(buffer).strip("*")
226 header = line[1:].split()[0]
227 buffer = []
228 else:
229 buffer.append(line)
230 if header:
231 seqs[header] = "".join(buffer).strip("*")
232 return seqs
235def run_prodigal(
236 genome_id: str, fasta_path: str, out_dir: str, force: bool = False
237) -> str:
238 """
239 Run Prodigal to predict protein translations from a genome FASTA.
241 Parameters
242 ----------
243 genome_id : str
244 Identifier for this genome; used to name output files.
245 fasta_path : str
246 Path to the input genome FASTA file.
247 out_dir : str
248 Directory under which a subdirectory named `genome_id` will be created
249 (if necessary) to hold the predicted proteins.
250 force : bool, optional
251 If True and the output file already exists, it will be removed and
252 regenerated. Default is False.
254 Returns
255 -------
256 str
257 Path to the predicted protein FASTA (`.faa`) file.
259 Raises
260 ------
261 RuntimeError
262 If the `prodigal` executable cannot be found in `PATH`.
263 subprocess.CalledProcessError
264 If Prodigal exits with a non-zero status.
265 """
266 prot_dir = os.path.join(out_dir, genome_id)
267 os.makedirs(prot_dir, exist_ok=True)
268 prot_fa = os.path.join(prot_dir, f"{genome_id}.faa")
270 if force and os.path.exists(prot_fa):
271 os.remove(prot_fa)
273 # check if gzipped
274 if fasta_path.endswith(".gz"):
275 with gzip.open(fasta_path, "rt") as gz_in, tempfile.NamedTemporaryFile(
276 mode="w+", delete=False, suffix=".fasta"
277 ) as tmp_fa:
278 shutil.copyfileobj(gz_in, tmp_fa)
279 tmp_fa_path = tmp_fa.name
280 input_path = tmp_fa_path
281 else:
282 input_path = fasta_path
283 subprocess.run(
284 ["prodigal", "-a", prot_fa, "-p", "meta", "-i", input_path],
285 check=True,
286 stdout=subprocess.DEVNULL,
287 stderr=subprocess.DEVNULL,
288 )
289 return prot_fa
292def parse_domtblout_top_hits(domtbl_path: str) -> Dict[str, List[str]]:
293 """
294 Parse a HMMER --domtblout file and select the top hit per sequence.
296 Implements the GTDB-Tk comparator logic: for each query sequence,
297 keeps the hit with highest bitscore; ties broken by lower e-value,
298 then by lexicographically smaller HMM ID.
300 Parameters
301 ----------
302 domtbl_path : str
303 Path to the HMMER --domtblout output file.
305 Returns
306 -------
307 Dict[str, List[str]]
308 Mapping from HMM ID to a list of sequence IDs that were chosen
309 as top hit(s) for that HMM.
311 Raises
312 ------
313 IOError
314 If the domtblout file cannot be opened.
315 ValueError
316 If a non-numeric e-value or bitscore is encountered.
317 """
318 seq_matches: Dict[str, Tuple[str, float, float]] = {}
319 with open(domtbl_path) as fh:
320 for line in fh:
321 if line.startswith("#"):
322 continue
323 parts = line.split()
324 seq_id = parts[0]
325 hmm_id = parts[3]
326 evalue = float(parts[4])
327 bitscore = float(parts[5])
328 # look for the best hit for this sequence
329 prev = seq_matches.get(seq_id)
330 if prev is None:
331 seq_matches[seq_id] = (hmm_id, bitscore, evalue)
332 else:
333 # only keep the best hit
334 prev_hmm_id, prev_b, prev_e = prev
335 if (
336 bitscore > prev_b
337 or (bitscore == prev_b and evalue < prev_e)
338 or (
339 bitscore == prev_b and evalue == prev_e and hmm_id < prev_hmm_id
340 )
341 ):
342 seq_matches[seq_id] = (hmm_id, bitscore, evalue)
344 # now, invert the mapping to get HMM IDs to sequences
345 hits: Dict[str, List[str]] = {}
346 for seq_id, (hmm_id, _, _) in seq_matches.items():
347 hits.setdefault(hmm_id, []).append(seq_id)
348 return hits
351def extract_single_copy_markers(
352 genomes: Dict[str, str],
353 out_dir: str,
354 cpus: int = 1,
355 pfam_db: str = os.environ.get("PFAM_HMMDB"),
356 tigr_db: str = os.environ.get("TIGR_HMMDB"),
357 force: bool = False,
358) -> Dict[str, Dict[str, List[str]]]:
360 out_fastas = { gid: {"bac120": [], "ar53": []} for gid in genomes }
361 total = len(genomes)
363 with Progress(
364 TextColumn("[cyan]{task.description}"),
365 BarColumn(),
366 TaskProgressColumn(),
367 TimeRemainingColumn(),
368 ) as progress:
369 task_pred = progress.add_task("Prodigal", total=total)
370 task_pfam = progress.add_task("Pfam HMMs", total=total)
371 task_tigr = progress.add_task("TIGRFAM HMMs", total=total)
372 task_fasta = progress.add_task("Writing FASTAs", total=total)
374 for gid, path in genomes.items():
375 progress.console.print(f"Processing genome {gid} from {path}")
376 # 1) Prodigal
377 prot_fa = run_prodigal(gid, path, out_dir, force=force)
378 prot_seqs = read_fasta(prot_fa)
379 progress.update(task_pred, advance=1)
381 # 2) Pfam search
382 pf_out = os.path.join(out_dir, gid, "pfam.tblout")
383 if force and os.path.exists(pf_out): os.remove(pf_out)
384 subprocess.run(
385 ["hmmsearch","--cpu",str(cpus),
386 "--notextw","-E","0.001","--domE","0.001",
387 "--tblout",pf_out, pfam_db, prot_fa],
388 check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
389 )
390 progress.update(task_pfam, advance=1)
392 # 3) TIGRFAM search
393 tg_out = os.path.join(out_dir, gid, "tigrfam.tblout")
394 if force and os.path.exists(tg_out): os.remove(tg_out)
395 subprocess.run(
396 ["hmmsearch","--cpu",str(cpus),
397 "--noali","--notextw","--cut_nc",
398 "--tblout",tg_out, tigr_db, prot_fa],
399 check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
400 )
401 progress.update(task_tigr, advance=1)
403 # 4) Parse & merge
404 hits = parse_domtblout_top_hits(pf_out) | parse_domtblout_top_hits(tg_out)
406 # 5) Write out the sequences
407 for marker, recs in hits.items():
408 if len(recs) == 0:
409 # no hits
410 continue
411 elif len(recs) == 1:
412 # unique hit
413 seq = prot_seqs[recs[0]]
414 else:
415 # multiple hits
416 # check if they’re all identical
417 seqs = set(prot_seqs[s] for s in recs)
418 if len(seqs) != 1:
419 # not identical, so skip
420 continue
421 # identical, so count it as a unique hit
422 seq = seqs[0]
424 for dom in ("bac120", "ar53"):
425 # write out the sequences to separate directories for each domain
426 if (dom == "bac120" and marker in BAC120_MARKERS) or \
427 (dom == "ar53" and marker in AR53_MARKERS):
428 genome_dir = os.path.join(out_dir, gid, dom)
429 os.makedirs(genome_dir, exist_ok=True)
430 fa_path = os.path.join(genome_dir, f"{marker}.fa")
431 with open(fa_path, "w") as fh:
432 fh.write(f">{gid}\n{seq}\n")
433 out_fastas[gid][dom].append(fa_path)
435 progress.update(task_fasta, advance=1)
437 return out_fastas