Coverage for bloodhound/markers.py: 95.10%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1import os

2import subprocess

3from typing import Dict, List, Tuple

4import gzip

5import shutil

6import tempfile

7from rich.progress import (

8 Progress,

9 TextColumn,

10 BarColumn,

11 TaskProgressColumn,

12 TimeRemainingColumn,

13)

15# Marker information

16BAC120_MARKERS = [

17 "PF00380.20",

18 "PF00410.20",

19 "PF00466.21",

20 "PF01025.20",

21 "PF02576.18",

22 "PF03726.15",

23 "TIGR00006",

24 "TIGR00019",

25 "TIGR00020",

26 "TIGR00029",

27 "TIGR00043",

28 "TIGR00054",

29 "TIGR00059",

30 "TIGR00061",

31 "TIGR00064",

32 "TIGR00065",

33 "TIGR00082",

34 "TIGR00083",

35 "TIGR00084",

36 "TIGR00086",

37 "TIGR00088",

38 "TIGR00090",

39 "TIGR00092",

40 "TIGR00095",

41 "TIGR00115",

42 "TIGR00116",

43 "TIGR00138",

44 "TIGR00158",

45 "TIGR00166",

46 "TIGR00168",

47 "TIGR00186",

48 "TIGR00194",

49 "TIGR00250",

50 "TIGR00337",

51 "TIGR00344",

52 "TIGR00362",

53 "TIGR00382",

54 "TIGR00392",

55 "TIGR00396",

56 "TIGR00398",

57 "TIGR00414",

58 "TIGR00416",

59 "TIGR00420",

60 "TIGR00431",

61 "TIGR00435",

62 "TIGR00436",

63 "TIGR00442",

64 "TIGR00445",

65 "TIGR00456",

66 "TIGR00459",

67 "TIGR00460",

68 "TIGR00468",

69 "TIGR00472",

70 "TIGR00487",

71 "TIGR00496",

72 "TIGR00539",

73 "TIGR00580",

74 "TIGR00593",

75 "TIGR00615",

76 "TIGR00631",

77 "TIGR00634",

78 "TIGR00635",

79 "TIGR00643",

80 "TIGR00663",

81 "TIGR00717",

82 "TIGR00755",

83 "TIGR00810",

84 "TIGR00922",

85 "TIGR00928",

86 "TIGR00959",

87 "TIGR00963",

88 "TIGR00964",

89 "TIGR00967",

90 "TIGR01009",

91 "TIGR01011",

92 "TIGR01017",

93 "TIGR01021",

94 "TIGR01029",

95 "TIGR01032",

96 "TIGR01039",

97 "TIGR01044",

98 "TIGR01059",

99 "TIGR01063",

100 "TIGR01066",

101 "TIGR01071",

102 "TIGR01079",

103 "TIGR01082",

104 "TIGR01087",

105 "TIGR01128",

106 "TIGR01146",

107 "TIGR01164",

108 "TIGR01169",

109 "TIGR01171",

110 "TIGR01302",

111 "TIGR01391",

112 "TIGR01393",

113 "TIGR01394",

114 "TIGR01510",

115 "TIGR01632",

116 "TIGR01951",

117 "TIGR01953",

118 "TIGR02012",

119 "TIGR02013",

120 "TIGR02027",

121 "TIGR02075",

122 "TIGR02191",

123 "TIGR02273",

124 "TIGR02350",

125 "TIGR02386",

126 "TIGR02397",

127 "TIGR02432",

128 "TIGR02729",

129 "TIGR03263",

130 "TIGR03594",

131 "TIGR03625",

132 "TIGR03632",

133 "TIGR03654",

134 "TIGR03723",

135 "TIGR03725",

136 "TIGR03953",

137]

138

139AR53_MARKERS = [

140 "PF04919.13",

141 "PF07541.13",

142 "PF01000.27",

143 "PF00687.22",

144 "PF00466.21",

145 "PF00827.18",

146 "PF01280.21",

147 "PF01090.20",

148 "PF01200.19",

149 "PF01015.19",

150 "PF00900.21",

151 "PF00410.20",

152 "TIGR00037",

153 "TIGR00064",

154 "TIGR00111",

155 "TIGR00134",

156 "TIGR00279",

157 "TIGR00291",

158 "TIGR00323",

159 "TIGR00335",

160 "TIGR00373",

161 "TIGR00405",

162 "TIGR00448",

163 "TIGR00483",

164 "TIGR00491",

165 "TIGR00522",

166 "TIGR00967",

167 "TIGR00982",

168 "TIGR01008",

169 "TIGR01012",

170 "TIGR01018",

171 "TIGR01020",

172 "TIGR01028",

173 "TIGR01046",

174 "TIGR01052",

175 "TIGR01171",

176 "TIGR01213",

177 "TIGR01952",

178 "TIGR02236",

179 "TIGR02338",

180 "TIGR02389",

181 "TIGR02390",

182 "TIGR03626",

183 "TIGR03627",

184 "TIGR03628",

185 "TIGR03629",

186 "TIGR03670",

187 "TIGR03671",

188 "TIGR03672",

189 "TIGR03673",

190 "TIGR03674",

191 "TIGR03676",

192 "TIGR03680",

193]

194

195

196def read_fasta(path: str) -> Dict[str, str]:

197 """

198 Read a FASTA file into a dictionary of sequences.

199

200 Parameters

201 ----------

202 path : str

203 Filesystem path to the input FASTA file.

204

205 Returns

206 -------

207 Dict[str, str]

208 Mapping from sequence ID (the first token after ‘>’ in the header)

209 to the full sequence string, with any terminal “*” characters stripped.

210

211 Raises

212 ------

213 IOError

214 If the file cannot be opened for reading.

215 """

216 seqs: Dict[str, str] = {}

217 with open(path) as fh:

218 header, buffer = None, []

219 for line in fh:

220 line = line.rstrip()

221 if not line:

222 continue

223 if line.startswith(">"):

224 if header:

225 seqs[header] = "".join(buffer).strip("*")

226 header = line[1:].split()[0]

227 buffer = []

228 else:

229 buffer.append(line)

230 if header:

231 seqs[header] = "".join(buffer).strip("*")

232 return seqs

233

234

235def run_prodigal(

236 genome_id: str, fasta_path: str, out_dir: str, force: bool = False

237) -> str:

238 """

239 Run Prodigal to predict protein translations from a genome FASTA.

240

241 Parameters

242 ----------

243 genome_id : str

244 Identifier for this genome; used to name output files.

245 fasta_path : str

246 Path to the input genome FASTA file.

247 out_dir : str

248 Directory under which a subdirectory named `genome_id` will be created

249 (if necessary) to hold the predicted proteins.

250 force : bool, optional

251 If True and the output file already exists, it will be removed and

252 regenerated. Default is False.

253

254 Returns

255 -------

256 str

257 Path to the predicted protein FASTA (`.faa`) file.

258

259 Raises

260 ------

261 RuntimeError

262 If the `prodigal` executable cannot be found in `PATH`.

263 subprocess.CalledProcessError

264 If Prodigal exits with a non-zero status.

265 """

266 prot_dir = os.path.join(out_dir, genome_id)

267 os.makedirs(prot_dir, exist_ok=True)

268 prot_fa = os.path.join(prot_dir, f"{genome_id}.faa")

269

270 if force and os.path.exists(prot_fa):

271 os.remove(prot_fa)

272

273 # check if gzipped

274 if fasta_path.endswith(".gz"):

275 with gzip.open(fasta_path, "rt") as gz_in, tempfile.NamedTemporaryFile(

276 mode="w+", delete=False, suffix=".fasta"

277 ) as tmp_fa:

278 shutil.copyfileobj(gz_in, tmp_fa)

279 tmp_fa_path = tmp_fa.name

280 input_path = tmp_fa_path

281 else:

282 input_path = fasta_path

283 subprocess.run(

284 ["prodigal", "-a", prot_fa, "-p", "meta", "-i", input_path],

285 check=True,

286 stdout=subprocess.DEVNULL,

287 stderr=subprocess.DEVNULL,

288 )

289 return prot_fa

290

291

292def parse_domtblout_top_hits(domtbl_path: str) -> Dict[str, List[str]]:

293 """

294 Parse a HMMER --domtblout file and select the top hit per sequence.

295

296 Implements the GTDB-Tk comparator logic: for each query sequence,

297 keeps the hit with highest bitscore; ties broken by lower e-value,

298 then by lexicographically smaller HMM ID.

299

300 Parameters

301 ----------

302 domtbl_path : str

303 Path to the HMMER --domtblout output file.

304

305 Returns

306 -------

307 Dict[str, List[str]]

308 Mapping from HMM ID to a list of sequence IDs that were chosen

309 as top hit(s) for that HMM.

310

311 Raises

312 ------

313 IOError

314 If the domtblout file cannot be opened.

315 ValueError

316 If a non-numeric e-value or bitscore is encountered.

317 """

318 seq_matches: Dict[str, Tuple[str, float, float]] = {}

319 with open(domtbl_path) as fh:

320 for line in fh:

321 if line.startswith("#"):

322 continue

323 parts = line.split()

324 seq_id = parts[0]

325 hmm_id = parts[3]

326 evalue = float(parts[4])

327 bitscore = float(parts[5])

328 # look for the best hit for this sequence

329 prev = seq_matches.get(seq_id)

330 if prev is None:

331 seq_matches[seq_id] = (hmm_id, bitscore, evalue)

332 else:

333 # only keep the best hit

334 prev_hmm_id, prev_b, prev_e = prev

335 if (

336 bitscore > prev_b

337 or (bitscore == prev_b and evalue < prev_e)

338 or (

339 bitscore == prev_b and evalue == prev_e and hmm_id < prev_hmm_id

340 )

341 ):

342 seq_matches[seq_id] = (hmm_id, bitscore, evalue)

343

344 # now, invert the mapping to get HMM IDs to sequences

345 hits: Dict[str, List[str]] = {}

346 for seq_id, (hmm_id, _, _) in seq_matches.items():

347 hits.setdefault(hmm_id, []).append(seq_id)

348 return hits

349

350

351def extract_single_copy_markers(

352 genomes: Dict[str, str],

353 out_dir: str,

354 cpus: int = 1,

355 pfam_db: str = os.environ.get("PFAM_HMMDB"),

356 tigr_db: str = os.environ.get("TIGR_HMMDB"),

357 force: bool = False,

358) -> Dict[str, Dict[str, List[str]]]:

359

360 out_fastas = { gid: {"bac120": [], "ar53": []} for gid in genomes }

361 total = len(genomes)

362

363 with Progress(

364 TextColumn("[cyan]{task.description}"),

365 BarColumn(),

366 TaskProgressColumn(),

367 TimeRemainingColumn(),

368 ) as progress:

369 task_pred = progress.add_task("Prodigal", total=total)

370 task_pfam = progress.add_task("Pfam HMMs", total=total)

371 task_tigr = progress.add_task("TIGRFAM HMMs", total=total)

372 task_fasta = progress.add_task("Writing FASTAs", total=total)

373

374 for gid, path in genomes.items():

375 progress.console.print(f"Processing genome {gid} from {path}")

376 # 1) Prodigal

377 prot_fa = run_prodigal(gid, path, out_dir, force=force)

378 prot_seqs = read_fasta(prot_fa)

379 progress.update(task_pred, advance=1)

380

381 # 2) Pfam search

382 pf_out = os.path.join(out_dir, gid, "pfam.tblout")

383 if force and os.path.exists(pf_out): os.remove(pf_out)

384 subprocess.run(

385 ["hmmsearch","--cpu",str(cpus),

386 "--notextw","-E","0.001","--domE","0.001",

387 "--tblout",pf_out, pfam_db, prot_fa],

388 check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,

389 )

390 progress.update(task_pfam, advance=1)

391

392 # 3) TIGRFAM search

393 tg_out = os.path.join(out_dir, gid, "tigrfam.tblout")

394 if force and os.path.exists(tg_out): os.remove(tg_out)

395 subprocess.run(

396 ["hmmsearch","--cpu",str(cpus),

397 "--noali","--notextw","--cut_nc",

398 "--tblout",tg_out, tigr_db, prot_fa],

399 check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,

400 )

401 progress.update(task_tigr, advance=1)

402

403 # 4) Parse & merge

404 hits = parse_domtblout_top_hits(pf_out) | parse_domtblout_top_hits(tg_out)

405

406 # 5) Write out the sequences

407 for marker, recs in hits.items():

408 if len(recs) == 0:

409 # no hits

410 continue

411 elif len(recs) == 1:

412 # unique hit

413 seq = prot_seqs[recs[0]]

414 else:

415 # multiple hits

416 # check if they’re all identical

417 seqs = set(prot_seqs[s] for s in recs)

418 if len(seqs) != 1:

419 # not identical, so skip

420 continue

421 # identical, so count it as a unique hit

422 seq = seqs[0]

423

424 for dom in ("bac120", "ar53"):

425 # write out the sequences to separate directories for each domain

426 if (dom == "bac120" and marker in BAC120_MARKERS) or \

427 (dom == "ar53" and marker in AR53_MARKERS):

428 genome_dir = os.path.join(out_dir, gid, dom)

429 os.makedirs(genome_dir, exist_ok=True)

430 fa_path = os.path.join(genome_dir, f"{marker}.fa")

431 with open(fa_path, "w") as fh:

432 fh.write(f">{gid}\n{seq}\n")

433 out_fastas[gid][dom].append(fa_path)

434

435 progress.update(task_fasta, advance=1)

436

437 return out_fastas

Coverage for bloodhound/markers.py : 95.10%

102 statements

Coverage for bloodhound/markers.py : 95.10%

102 statements 97 run 5 missing 0 excluded

102 statements