Coverage for bloodhound/gtdbtk.py: 0.00%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1import csv

2from pathlib import Path

5def read_tophits(path:Path|str) -> dict[str,str]:

6 # Initialize an empty dictionary to store the data

7 gene_family_dict = {}

9 # Read the TSV file

10 with open(path, 'r') as file:

11 reader = csv.DictReader(file, delimiter='\t')

12 for row in reader:

13 # Extract the Gene Id and Top hits (Family id,e-value,bitscore)

14 gene_id = row['Gene Id']

15 top_hits = row['Top hits (Family id,e-value,bitscore)']

17 # Split the top_hits to get the Family id

18 family_id = top_hits.split(',')[0]

20 # Add to the dictionary

21 gene_family_dict[gene_id] = family_id

23 return gene_family_dict

26def read_tigrfam(file_path:Path|str) -> dict[str,str]:

27 # Initialize an empty dictionary to store the data

28 gene_family_dict = {}

30 with open(file_path, 'r') as file:

31 lines = file.readlines()

33 # Find the header line with the actual column names

34 header_line = ""

35 for line in lines:

36 if line.startswith('#') and 'target name' in line:

37 header_line = line.strip('# \n')

38 break

40 # Re-open the file to use the CSV DictReader from the correct position

41 with open(file_path, 'r') as file:

42 # Skip lines until we find the header line

43 while True:

44 line = file.readline()

45 if header_line in line:

46 break

48 # Read the TSV data starting from the header line

49 reader = csv.DictReader(file, delimiter='\t', fieldnames=header_line.split())

50 next(reader) # Skip the header row itself

51 for row in reader:

52 # Extract the target name and query name

53 gene_id = row['target name'].strip()

54 family_id = row['query name'].strip()

56 # Add to the dictionary

57 gene_family_dict[gene_id] = family_id

59 return gene_family_dict

62def read_pfam(file_path:Path|str) -> dict[str,str]:

63 # Initialize an empty dictionary to store the data

64 gene_family_dict = {}

66 with open(file_path, 'r') as file:

67 lines = file.readlines()

69 # Find the header line with the actual column names

70 for i, line in enumerate(lines):

71 if line.startswith('#') and '<seq id>' in line:

72 header_index = i

73 break

75 # Read the TSV data starting from the header line

76 reader = csv.DictReader(lines[header_index + 1:], delimiter='\t',

77 fieldnames=["seq_id", "alignment_start", "alignment_end", "envelope_start", "envelope_end", "hmm_acc", "hmm_name", "type", "hmm_start", "hmm_end", "hmm_length", "bit_score", "e_value", "significance", "clan"])

78 for row in reader:

79 # Extract the seq id and hmm acc

80 gene_id = row['seq_id'].strip()

81 family_id = row['hmm_acc'].strip()

83 # Add to the dictionary

84 gene_family_dict[gene_id] = family_id

86 return gene_family_dict

Coverage for bloodhound/gtdbtk.py : 0.00%

47 statements

Coverage for bloodhound/gtdbtk.py : 0.00%

47 statements 0 run 47 missing 0 excluded

47 statements