Coverage for bloodhound/gtdbtk.py : 0.00%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import csv
2from pathlib import Path
5def read_tophits(path:Path|str) -> dict[str,str]:
6 # Initialize an empty dictionary to store the data
7 gene_family_dict = {}
9 # Read the TSV file
10 with open(path, 'r') as file:
11 reader = csv.DictReader(file, delimiter='\t')
12 for row in reader:
13 # Extract the Gene Id and Top hits (Family id,e-value,bitscore)
14 gene_id = row['Gene Id']
15 top_hits = row['Top hits (Family id,e-value,bitscore)']
17 # Split the top_hits to get the Family id
18 family_id = top_hits.split(',')[0]
20 # Add to the dictionary
21 gene_family_dict[gene_id] = family_id
23 return gene_family_dict
26def read_tigrfam(file_path:Path|str) -> dict[str,str]:
27 # Initialize an empty dictionary to store the data
28 gene_family_dict = {}
30 with open(file_path, 'r') as file:
31 lines = file.readlines()
33 # Find the header line with the actual column names
34 header_line = ""
35 for line in lines:
36 if line.startswith('#') and 'target name' in line:
37 header_line = line.strip('# \n')
38 break
40 # Re-open the file to use the CSV DictReader from the correct position
41 with open(file_path, 'r') as file:
42 # Skip lines until we find the header line
43 while True:
44 line = file.readline()
45 if header_line in line:
46 break
48 # Read the TSV data starting from the header line
49 reader = csv.DictReader(file, delimiter='\t', fieldnames=header_line.split())
50 next(reader) # Skip the header row itself
51 for row in reader:
52 # Extract the target name and query name
53 gene_id = row['target name'].strip()
54 family_id = row['query name'].strip()
56 # Add to the dictionary
57 gene_family_dict[gene_id] = family_id
59 return gene_family_dict
62def read_pfam(file_path:Path|str) -> dict[str,str]:
63 # Initialize an empty dictionary to store the data
64 gene_family_dict = {}
66 with open(file_path, 'r') as file:
67 lines = file.readlines()
69 # Find the header line with the actual column names
70 for i, line in enumerate(lines):
71 if line.startswith('#') and '<seq id>' in line:
72 header_index = i
73 break
75 # Read the TSV data starting from the header line
76 reader = csv.DictReader(lines[header_index + 1:], delimiter='\t',
77 fieldnames=["seq_id", "alignment_start", "alignment_end", "envelope_start", "envelope_end", "hmm_acc", "hmm_name", "type", "hmm_start", "hmm_end", "hmm_length", "bit_score", "e_value", "significance", "clan"])
78 for row in reader:
79 # Extract the seq id and hmm acc
80 gene_id = row['seq_id'].strip()
81 family_id = row['hmm_acc'].strip()
83 # Add to the dictionary
84 gene_family_dict[gene_id] = family_id
86 return gene_family_dict