Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import csv 

2from pathlib import Path 

3 

4 

5def read_tophits(path:Path|str) -> dict[str,str]: 

6 # Initialize an empty dictionary to store the data 

7 gene_family_dict = {} 

8 

9 # Read the TSV file 

10 with open(path, 'r') as file: 

11 reader = csv.DictReader(file, delimiter='\t') 

12 for row in reader: 

13 # Extract the Gene Id and Top hits (Family id,e-value,bitscore) 

14 gene_id = row['Gene Id'] 

15 top_hits = row['Top hits (Family id,e-value,bitscore)'] 

16 

17 # Split the top_hits to get the Family id 

18 family_id = top_hits.split(',')[0] 

19 

20 # Add to the dictionary 

21 gene_family_dict[gene_id] = family_id 

22 

23 return gene_family_dict 

24 

25 

26def read_tigrfam(file_path:Path|str) -> dict[str,str]: 

27 # Initialize an empty dictionary to store the data 

28 gene_family_dict = {} 

29 

30 with open(file_path, 'r') as file: 

31 lines = file.readlines() 

32 

33 # Find the header line with the actual column names 

34 header_line = "" 

35 for line in lines: 

36 if line.startswith('#') and 'target name' in line: 

37 header_line = line.strip('# \n') 

38 break 

39 

40 # Re-open the file to use the CSV DictReader from the correct position 

41 with open(file_path, 'r') as file: 

42 # Skip lines until we find the header line 

43 while True: 

44 line = file.readline() 

45 if header_line in line: 

46 break 

47 

48 # Read the TSV data starting from the header line 

49 reader = csv.DictReader(file, delimiter='\t', fieldnames=header_line.split()) 

50 next(reader) # Skip the header row itself 

51 for row in reader: 

52 # Extract the target name and query name 

53 gene_id = row['target name'].strip() 

54 family_id = row['query name'].strip() 

55 

56 # Add to the dictionary 

57 gene_family_dict[gene_id] = family_id 

58 

59 return gene_family_dict 

60 

61 

62def read_pfam(file_path:Path|str) -> dict[str,str]: 

63 # Initialize an empty dictionary to store the data 

64 gene_family_dict = {} 

65 

66 with open(file_path, 'r') as file: 

67 lines = file.readlines() 

68 

69 # Find the header line with the actual column names 

70 for i, line in enumerate(lines): 

71 if line.startswith('#') and '<seq id>' in line: 

72 header_index = i 

73 break 

74 

75 # Read the TSV data starting from the header line 

76 reader = csv.DictReader(lines[header_index + 1:], delimiter='\t', 

77 fieldnames=["seq_id", "alignment_start", "alignment_end", "envelope_start", "envelope_end", "hmm_acc", "hmm_name", "type", "hmm_start", "hmm_end", "hmm_length", "bit_score", "e_value", "significance", "clan"]) 

78 for row in reader: 

79 # Extract the seq id and hmm acc 

80 gene_id = row['seq_id'].strip() 

81 family_id = row['hmm_acc'].strip() 

82 

83 # Add to the dictionary 

84 gene_family_dict[gene_id] = family_id 

85 

86 return gene_family_dict