Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import os 

2import subprocess 

3from typing import Dict, List, Tuple 

4import gzip 

5import shutil 

6import tempfile 

7from rich.progress import ( 

8 Progress, 

9 TextColumn, 

10 BarColumn, 

11 TaskProgressColumn, 

12 TimeRemainingColumn, 

13) 

14 

15# Marker information 

16BAC120_MARKERS = [ 

17 "PF00380.20", 

18 "PF00410.20", 

19 "PF00466.21", 

20 "PF01025.20", 

21 "PF02576.18", 

22 "PF03726.15", 

23 "TIGR00006", 

24 "TIGR00019", 

25 "TIGR00020", 

26 "TIGR00029", 

27 "TIGR00043", 

28 "TIGR00054", 

29 "TIGR00059", 

30 "TIGR00061", 

31 "TIGR00064", 

32 "TIGR00065", 

33 "TIGR00082", 

34 "TIGR00083", 

35 "TIGR00084", 

36 "TIGR00086", 

37 "TIGR00088", 

38 "TIGR00090", 

39 "TIGR00092", 

40 "TIGR00095", 

41 "TIGR00115", 

42 "TIGR00116", 

43 "TIGR00138", 

44 "TIGR00158", 

45 "TIGR00166", 

46 "TIGR00168", 

47 "TIGR00186", 

48 "TIGR00194", 

49 "TIGR00250", 

50 "TIGR00337", 

51 "TIGR00344", 

52 "TIGR00362", 

53 "TIGR00382", 

54 "TIGR00392", 

55 "TIGR00396", 

56 "TIGR00398", 

57 "TIGR00414", 

58 "TIGR00416", 

59 "TIGR00420", 

60 "TIGR00431", 

61 "TIGR00435", 

62 "TIGR00436", 

63 "TIGR00442", 

64 "TIGR00445", 

65 "TIGR00456", 

66 "TIGR00459", 

67 "TIGR00460", 

68 "TIGR00468", 

69 "TIGR00472", 

70 "TIGR00487", 

71 "TIGR00496", 

72 "TIGR00539", 

73 "TIGR00580", 

74 "TIGR00593", 

75 "TIGR00615", 

76 "TIGR00631", 

77 "TIGR00634", 

78 "TIGR00635", 

79 "TIGR00643", 

80 "TIGR00663", 

81 "TIGR00717", 

82 "TIGR00755", 

83 "TIGR00810", 

84 "TIGR00922", 

85 "TIGR00928", 

86 "TIGR00959", 

87 "TIGR00963", 

88 "TIGR00964", 

89 "TIGR00967", 

90 "TIGR01009", 

91 "TIGR01011", 

92 "TIGR01017", 

93 "TIGR01021", 

94 "TIGR01029", 

95 "TIGR01032", 

96 "TIGR01039", 

97 "TIGR01044", 

98 "TIGR01059", 

99 "TIGR01063", 

100 "TIGR01066", 

101 "TIGR01071", 

102 "TIGR01079", 

103 "TIGR01082", 

104 "TIGR01087", 

105 "TIGR01128", 

106 "TIGR01146", 

107 "TIGR01164", 

108 "TIGR01169", 

109 "TIGR01171", 

110 "TIGR01302", 

111 "TIGR01391", 

112 "TIGR01393", 

113 "TIGR01394", 

114 "TIGR01510", 

115 "TIGR01632", 

116 "TIGR01951", 

117 "TIGR01953", 

118 "TIGR02012", 

119 "TIGR02013", 

120 "TIGR02027", 

121 "TIGR02075", 

122 "TIGR02191", 

123 "TIGR02273", 

124 "TIGR02350", 

125 "TIGR02386", 

126 "TIGR02397", 

127 "TIGR02432", 

128 "TIGR02729", 

129 "TIGR03263", 

130 "TIGR03594", 

131 "TIGR03625", 

132 "TIGR03632", 

133 "TIGR03654", 

134 "TIGR03723", 

135 "TIGR03725", 

136 "TIGR03953", 

137] 

138 

139AR53_MARKERS = [ 

140 "PF04919.13", 

141 "PF07541.13", 

142 "PF01000.27", 

143 "PF00687.22", 

144 "PF00466.21", 

145 "PF00827.18", 

146 "PF01280.21", 

147 "PF01090.20", 

148 "PF01200.19", 

149 "PF01015.19", 

150 "PF00900.21", 

151 "PF00410.20", 

152 "TIGR00037", 

153 "TIGR00064", 

154 "TIGR00111", 

155 "TIGR00134", 

156 "TIGR00279", 

157 "TIGR00291", 

158 "TIGR00323", 

159 "TIGR00335", 

160 "TIGR00373", 

161 "TIGR00405", 

162 "TIGR00448", 

163 "TIGR00483", 

164 "TIGR00491", 

165 "TIGR00522", 

166 "TIGR00967", 

167 "TIGR00982", 

168 "TIGR01008", 

169 "TIGR01012", 

170 "TIGR01018", 

171 "TIGR01020", 

172 "TIGR01028", 

173 "TIGR01046", 

174 "TIGR01052", 

175 "TIGR01171", 

176 "TIGR01213", 

177 "TIGR01952", 

178 "TIGR02236", 

179 "TIGR02338", 

180 "TIGR02389", 

181 "TIGR02390", 

182 "TIGR03626", 

183 "TIGR03627", 

184 "TIGR03628", 

185 "TIGR03629", 

186 "TIGR03670", 

187 "TIGR03671", 

188 "TIGR03672", 

189 "TIGR03673", 

190 "TIGR03674", 

191 "TIGR03676", 

192 "TIGR03680", 

193] 

194 

195 

196def read_fasta(path: str) -> Dict[str, str]: 

197 """ 

198 Read a FASTA file into a dictionary of sequences. 

199 

200 Parameters 

201 ---------- 

202 path : str 

203 Filesystem path to the input FASTA file. 

204 

205 Returns 

206 ------- 

207 Dict[str, str] 

208 Mapping from sequence ID (the first token after ‘>’ in the header) 

209 to the full sequence string, with any terminal “*” characters stripped. 

210 

211 Raises 

212 ------ 

213 IOError 

214 If the file cannot be opened for reading. 

215 """ 

216 seqs: Dict[str, str] = {} 

217 with open(path) as fh: 

218 header, buffer = None, [] 

219 for line in fh: 

220 line = line.rstrip() 

221 if not line: 

222 continue 

223 if line.startswith(">"): 

224 if header: 

225 seqs[header] = "".join(buffer).strip("*") 

226 header = line[1:].split()[0] 

227 buffer = [] 

228 else: 

229 buffer.append(line) 

230 if header: 

231 seqs[header] = "".join(buffer).strip("*") 

232 return seqs 

233 

234 

235def run_prodigal( 

236 genome_id: str, fasta_path: str, out_dir: str, force: bool = False 

237) -> str: 

238 """ 

239 Run Prodigal to predict protein translations from a genome FASTA. 

240 

241 Parameters 

242 ---------- 

243 genome_id : str 

244 Identifier for this genome; used to name output files. 

245 fasta_path : str 

246 Path to the input genome FASTA file. 

247 out_dir : str 

248 Directory under which a subdirectory named `genome_id` will be created 

249 (if necessary) to hold the predicted proteins. 

250 force : bool, optional 

251 If True and the output file already exists, it will be removed and 

252 regenerated. Default is False. 

253 

254 Returns 

255 ------- 

256 str 

257 Path to the predicted protein FASTA (`.faa`) file. 

258 

259 Raises 

260 ------ 

261 RuntimeError 

262 If the `prodigal` executable cannot be found in `PATH`. 

263 subprocess.CalledProcessError 

264 If Prodigal exits with a non-zero status. 

265 """ 

266 prot_dir = os.path.join(out_dir, genome_id) 

267 os.makedirs(prot_dir, exist_ok=True) 

268 prot_fa = os.path.join(prot_dir, f"{genome_id}.faa") 

269 

270 if force and os.path.exists(prot_fa): 

271 os.remove(prot_fa) 

272 

273 # check if gzipped 

274 if fasta_path.endswith(".gz"): 

275 with gzip.open(fasta_path, "rt") as gz_in, tempfile.NamedTemporaryFile( 

276 mode="w+", delete=False, suffix=".fasta" 

277 ) as tmp_fa: 

278 shutil.copyfileobj(gz_in, tmp_fa) 

279 tmp_fa_path = tmp_fa.name 

280 input_path = tmp_fa_path 

281 else: 

282 input_path = fasta_path 

283 subprocess.run( 

284 ["prodigal", "-a", prot_fa, "-p", "meta", "-i", input_path], 

285 check=True, 

286 stdout=subprocess.DEVNULL, 

287 stderr=subprocess.DEVNULL, 

288 ) 

289 return prot_fa 

290 

291 

292def parse_domtblout_top_hits(domtbl_path: str) -> Dict[str, List[str]]: 

293 """ 

294 Parse a HMMER --domtblout file and select the top hit per sequence. 

295 

296 Implements the GTDB-Tk comparator logic: for each query sequence, 

297 keeps the hit with highest bitscore; ties broken by lower e-value, 

298 then by lexicographically smaller HMM ID. 

299 

300 Parameters 

301 ---------- 

302 domtbl_path : str 

303 Path to the HMMER --domtblout output file. 

304 

305 Returns 

306 ------- 

307 Dict[str, List[str]] 

308 Mapping from HMM ID to a list of sequence IDs that were chosen 

309 as top hit(s) for that HMM. 

310 

311 Raises 

312 ------ 

313 IOError 

314 If the domtblout file cannot be opened. 

315 ValueError 

316 If a non-numeric e-value or bitscore is encountered. 

317 """ 

318 seq_matches: Dict[str, Tuple[str, float, float]] = {} 

319 with open(domtbl_path) as fh: 

320 for line in fh: 

321 if line.startswith("#"): 

322 continue 

323 parts = line.split() 

324 seq_id = parts[0] 

325 hmm_id = parts[3] 

326 evalue = float(parts[4]) 

327 bitscore = float(parts[5]) 

328 # look for the best hit for this sequence 

329 prev = seq_matches.get(seq_id) 

330 if prev is None: 

331 seq_matches[seq_id] = (hmm_id, bitscore, evalue) 

332 else: 

333 # only keep the best hit 

334 prev_hmm_id, prev_b, prev_e = prev 

335 if ( 

336 bitscore > prev_b 

337 or (bitscore == prev_b and evalue < prev_e) 

338 or ( 

339 bitscore == prev_b and evalue == prev_e and hmm_id < prev_hmm_id 

340 ) 

341 ): 

342 seq_matches[seq_id] = (hmm_id, bitscore, evalue) 

343 

344 # now, invert the mapping to get HMM IDs to sequences 

345 hits: Dict[str, List[str]] = {} 

346 for seq_id, (hmm_id, _, _) in seq_matches.items(): 

347 hits.setdefault(hmm_id, []).append(seq_id) 

348 return hits 

349 

350 

351def extract_single_copy_markers( 

352 genomes: Dict[str, str], 

353 out_dir: str, 

354 cpus: int = 1, 

355 pfam_db: str = os.environ.get("PFAM_HMMDB"), 

356 tigr_db: str = os.environ.get("TIGR_HMMDB"), 

357 force: bool = False, 

358) -> Dict[str, Dict[str, List[str]]]: 

359 

360 out_fastas = { gid: {"bac120": [], "ar53": []} for gid in genomes } 

361 total = len(genomes) 

362 

363 with Progress( 

364 TextColumn("[cyan]{task.description}"), 

365 BarColumn(), 

366 TaskProgressColumn(), 

367 TimeRemainingColumn(), 

368 ) as progress: 

369 task_pred = progress.add_task("Prodigal", total=total) 

370 task_pfam = progress.add_task("Pfam HMMs", total=total) 

371 task_tigr = progress.add_task("TIGRFAM HMMs", total=total) 

372 task_fasta = progress.add_task("Writing FASTAs", total=total) 

373 

374 for gid, path in genomes.items(): 

375 progress.console.print(f"Processing genome {gid} from {path}") 

376 # 1) Prodigal 

377 prot_fa = run_prodigal(gid, path, out_dir, force=force) 

378 prot_seqs = read_fasta(prot_fa) 

379 progress.update(task_pred, advance=1) 

380 

381 # 2) Pfam search 

382 pf_out = os.path.join(out_dir, gid, "pfam.tblout") 

383 if force and os.path.exists(pf_out): os.remove(pf_out) 

384 subprocess.run( 

385 ["hmmsearch","--cpu",str(cpus), 

386 "--notextw","-E","0.001","--domE","0.001", 

387 "--tblout",pf_out, pfam_db, prot_fa], 

388 check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, 

389 ) 

390 progress.update(task_pfam, advance=1) 

391 

392 # 3) TIGRFAM search 

393 tg_out = os.path.join(out_dir, gid, "tigrfam.tblout") 

394 if force and os.path.exists(tg_out): os.remove(tg_out) 

395 subprocess.run( 

396 ["hmmsearch","--cpu",str(cpus), 

397 "--noali","--notextw","--cut_nc", 

398 "--tblout",tg_out, tigr_db, prot_fa], 

399 check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, 

400 ) 

401 progress.update(task_tigr, advance=1) 

402 

403 # 4) Parse & merge 

404 hits = parse_domtblout_top_hits(pf_out) | parse_domtblout_top_hits(tg_out) 

405 

406 # 5) Write out the sequences 

407 for marker, recs in hits.items(): 

408 if len(recs) == 0: 

409 # no hits 

410 continue 

411 elif len(recs) == 1: 

412 # unique hit 

413 seq = prot_seqs[recs[0]] 

414 else: 

415 # multiple hits 

416 # check if they’re all identical 

417 seqs = set(prot_seqs[s] for s in recs) 

418 if len(seqs) != 1: 

419 # not identical, so skip 

420 continue 

421 # identical, so count it as a unique hit 

422 seq = seqs[0] 

423 

424 for dom in ("bac120", "ar53"): 

425 # write out the sequences to separate directories for each domain  

426 if (dom == "bac120" and marker in BAC120_MARKERS) or \ 

427 (dom == "ar53" and marker in AR53_MARKERS): 

428 genome_dir = os.path.join(out_dir, gid, dom) 

429 os.makedirs(genome_dir, exist_ok=True) 

430 fa_path = os.path.join(genome_dir, f"{marker}.fa") 

431 with open(fa_path, "w") as fh: 

432 fh.write(f">{gid}\n{seq}\n") 

433 out_fastas[gid][dom].append(fa_path) 

434 

435 progress.update(task_fasta, advance=1) 

436 

437 return out_fastas