-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdesign_process_example.py
More file actions
79 lines (56 loc) · 1.8 KB
/
Copy pathdesign_process_example.py
File metadata and controls
79 lines (56 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import sys
def compute_gc(seq):
    """
    Computes the GC content of a sequence

    Parameters:
        seq (str): A DNA sequence; upper- and lower-case bases are both
            counted

    Returns:
        gc_content (float): The fraction of bases in the sequence that are
            G or C, or 0.0 if the sequence is empty
    """
    # An empty sequence has nothing to normalize by; report 0.0 rather than
    # raising ZeroDivisionError
    if not seq:
        return 0.0

    # Count G and C in either case so lower-case (soft-masked) bases are not
    # silently left out of the count
    gc_bases = {'G', 'C', 'g', 'c'}
    gc_count = sum(1 for base in seq if base in gc_bases)

    # Normalize by the total sequence length
    gc_content = gc_count / len(seq)
    return gc_content
def get_sequences(filename):
    """
    Returns the sequences in a fasta file

    Parameters:
        filename (str): Name of the fasta file

    Returns:
        sequences (dict): Dictionary of sequences keyed by fasta record
            header line (whitespace-stripped, leading '>' included). A
            record spread over several lines is concatenated into one
            string.

    Raises:
        ValueError: If sequence data appears before the first '>' header
    """
    # Collect the pieces of each record in a list and join once at the end,
    # instead of growing a string with repeated concatenation
    pieces = {}
    record_name = None

    # Read fasta file and store in dictionary
    with open(filename, 'r') as fasta:
        for line in fasta:
            stripped = line.strip()
            if line.startswith('>'):
                # A new header starts (or restarts) a record
                record_name = stripped
                pieces[record_name] = []
            elif not stripped:
                # Blank lines carry no sequence data
                continue
            elif record_name is None:
                # Sequence data with no header cannot be attributed to a
                # record; fail with a clear message
                raise ValueError(
                    'sequence data found before the first fasta header in '
                    + repr(filename)
                )
            else:
                pieces[record_name].append(stripped)

    sequences = {name: ''.join(parts) for name, parts in pieces.items()}
    return sequences
def main():
    """Prints out GC-contents of all sequences in fasta file

    Reads the fasta filename from the first command line argument and
    prints a dictionary mapping each record name to its GC content. Exits
    with a usage message when no filename is given.
    """
    # Fail with a usage message instead of an IndexError traceback when the
    # filename argument is missing
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else 'design_process_example.py'
        sys.exit('usage: {} <fasta_file>'.format(prog))

    # Get the filename from the command line argument
    filename = sys.argv[1]
    sequences = get_sequences(filename)

    # Compute the GC content of each sequence, keyed by record name
    gc_contents = {
        record_name: compute_gc(sequence)
        for record_name, sequence in sequences.items()
    }
    print(gc_contents)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module
if __name__ == "__main__":
    main()