Bioinformatics_programs/dna.py at main · Sabarish2001/Bioinformatics_programs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# Function that will return the length of the given sequence
def sequence_length(sequence):
    """Return the number of items in ``sequence`` as reported by ``len``."""
    return len(sequence)

#Function that will count the number of nucleotides in a given sequence and return the counts

def count_bases(sequences):
    """Count how many times each symbol occurs in ``sequences``.

    Args:
        sequences: Iterable of hashable symbols, e.g. a DNA string.

    Returns:
        A plain ``dict`` mapping each distinct symbol to its number of
        occurrences; keys are inserted in order of first appearance.
    """
    base_dict = {}
    # One pass both creates and increments each entry, so one-shot
    # iterables (iterators, generators) are counted correctly too.
    for base in sequences:
        base_dict[base] = base_dict.get(base, 0) + 1
    return base_dict

# Demo: count the symbols of a sample sequence and print the resulting dict.
res = count_bases("ATGCATTTGGGGCCCAA")
print(res)

#Function that validates whether the given sequence is a DNA sequence or not

def validate_nucleotides(sequence):
    """Check that every item of ``sequence`` is one of A, T, G or C.

    Args:
        sequence: Iterable of bases, typically a string.

    Returns:
        ``sequence`` itself when every item is an allowed base; otherwise
        a message string naming the first item that is not allowed.
    """
    allowed = ("A", "T", "G", "C")

    for symbol in sequence:
        if symbol in allowed:
            continue
        # First offending symbol ends the scan; the caller gets a message
        # string instead of the sequence.
        return "Nucleotide error {}".format(symbol)
    return sequence

# Demo: validate a sample sequence and print what the validator returns.
res = validate_nucleotides("ATGC")
print(res)

#Function that will mimic the replication process in DNA (i.e. it returns the complementary bases)

def repliacte(sequence):
    """Return the complementary DNA strand of ``sequence``.

    Each base is swapped for its pairing partner (A<->T, G<->C); the
    order of the bases is kept as-is (the result is not reversed).

    Args:
        sequence: Iterable of bases drawn from "A", "T", "G", "C".

    Returns:
        The complementary bases joined into one ``str``.

    Raises:
        KeyError: If ``sequence`` contains a symbol other than A, T, G or C.
    """
    complementary_bases = {"A": "T", "G": "C", "C": "G", "T": "A"}
    # Build the result in one join; the previous loop accumulated into an
    # undefined name and therefore failed on every non-empty input.
    return "".join(complementary_bases[base] for base in sequence)

# Demo: print the complementary strand of a sample sequence.
res = repliacte("ATGCATGC")
print(res)

#Function that will transcribe DNA -> mRNA

def transcribe(sequence):
    """Transcribe a DNA template into its complementary mRNA string.

    Each base is replaced by its RNA complement (A->U, T->A, G->C, C->G).

    Args:
        sequence: Iterable of DNA bases drawn from "A", "T", "G", "C".

    Returns:
        The complementary mRNA as a ``str``.

    Raises:
        KeyError: If ``sequence`` contains a symbol other than A, T, G or C;
            the exception argument is the offending symbol.
    """
    complementary_base_pairs = {"A": "U", "T": "A", "G": "C", "C": "G"}
    # The table lookup doubles as validation: an unknown symbol raises
    # KeyError naming that symbol, rather than failing while iterating a
    # validator's error-message text.
    return "".join(complementary_base_pairs[base] for base in sequence)

#   return liste
# Demo: transcribe a sample DNA string and print the result.
res1 = transcribe("ATGC")
print(res1)

#Function that will return the number of matches and mismatches between two sequences of equal length

def match_mismatch(sequence_1, sequence_2):
    """Count position-wise matches and mismatches between two sequences.

    Args:
        sequence_1: First sequence.
        sequence_2: Second sequence, compared index by index with the first.

    Returns:
        A ``(match, mismatch)`` tuple. When the two sequences differ in
        length no comparison is made and ``(0, 0)`` is returned.
    """
    total = len(sequence_1)
    if total != len(sequence_2):
        return (0, 0)

    same = sum(1 for left, right in zip(sequence_1, sequence_2) if left == right)
    # Every compared position is either a match or a mismatch.
    return (same, total - same)


# Demo: compare two sample sequences; x is the (match, mismatch) tuple,
# which supplies both %s fields of the format string below.
x = match_mismatch(sequence_1="ATGCATGC",sequence_2="TTGCGTGC")
print("Matches = %s , Mismatches = %s" %(x))


#Function that will give you the AT composition in the given sequence

def AT_composition(sequence):
    """Return the percentage of ``sequence`` made up of 'A' and 'T'.

    Args:
        sequence: Sequence supporting ``len`` and ``count`` (e.g. a string).
            Only upper-case 'A' and 'T' are counted.

    Returns:
        The A+T share as a float percentage between 0 and 100; ``0.0``
        for an empty sequence.
    """
    total = len(sequence)
    if not total:
        # Nothing to measure: report 0.0 rather than dividing by zero.
        return 0.0
    return ((sequence.count('A') + sequence.count('T')) / total) * 100

#Function that will give you the GC composition in the given sequence


def GC_composition(sequence):
    """Return the percentage of ``sequence`` made up of 'G' and 'C'.

    Args:
        sequence: Sequence supporting ``len`` and ``count`` (e.g. a string).
            Only upper-case 'G' and 'C' are counted.

    Returns:
        The G+C share as a float percentage between 0 and 100; ``0.0``
        for an empty sequence.
    """
    total = len(sequence)
    if not total:
        # Nothing to measure: report 0.0 rather than dividing by zero.
        return 0.0
    return ((sequence.count('G') + sequence.count('C')) / total) * 100

#Function that will return the start positions at which the AT pair occurs in a given sequence

def AT_pairs(sequence,pattern = "AT"):
    """Return every start index at which ``pattern`` occurs in ``sequence``.

    Overlapping occurrences are all reported.

    Args:
        sequence: Sequence to scan (typically a string).
        pattern: Sub-sequence to look for; defaults to ``"AT"``.

    Returns:
        A list of 0-based start indices in ascending order; empty when
        ``pattern`` does not occur or is longer than ``sequence``.
    """
    pattern_len = len(pattern)
    # The last window that can still hold the whole pattern starts at
    # len(sequence) - pattern_len, so the bound depends on the pattern
    # length instead of assuming a two-character pattern.
    last_start = len(sequence) - pattern_len
    return [
        i
        for i in range(last_start + 1)
        if sequence[i: i + pattern_len] == pattern
    ]

                    #or

#    for i in range(0,(seq_len-1)):
#        if sequence[i: i + pattern_len] == pattern:
#            occurences.append(i)
#    return occurences


#Function that will give you the start positions at which the GC pair occurs in a given sequence

def GC_pairs(sequence,pattern="GC"):
    """Return every start index at which ``pattern`` occurs in ``sequence``.

    Args:
        sequence: Sequence to scan (typically a string).
        pattern: Sub-sequence to look for; defaults to ``"GC"``.

    Returns:
        A list of 0-based start indices in ascending order, overlapping
        occurrences included.
    """
    width = len(pattern)
    # Every index is tried as a start; slices near the end are simply
    # shorter than the pattern and therefore cannot compare equal to it.
    return [
        start
        for start in range(len(sequence))
        if sequence[start:start + width] == pattern
    ]

#Function Count will count the number of times a k-mer pattern appears as a substring of text

def count(text,pattern):
    """Count occurrences of ``pattern`` in ``text``, overlaps included.

    Args:
        text: Sequence to scan (typically a string).
        pattern: Sub-sequence (k-mer) to look for.

    Returns:
        The number of start positions at which ``text`` contains
        ``pattern``.
    """
    width = len(pattern)
    window_starts = range(len(text) - width + 1)
    return sum(1 for start in window_starts if text[start:start + width] == pattern)

# Demo: call count with an empty text and an empty pattern. The single
# window at index 0 is the empty slice, which equals the empty pattern,
# so this prints 1.
res = count("","")
print(res)


def validate_protein(protein):
    """Check that ``protein`` begins with "M" and ends with "Stop".

    Args:
        protein: String to check.

    Returns:
        ``protein`` unchanged when both conditions hold; otherwise a fixed
        message string that names the expected start and stop markers.
    """
    start_codon = "M"
    stop_codons = "Stop"

    well_formed = protein.startswith(start_codon) and protein.endswith(stop_codons)
    if not well_formed:
        # The same message is produced whichever of the two checks failed.
        return "Start codon error: {0}\n\t Stop codon error: {1}".format(start_codon,stop_codons)
    return protein

def reading_frame(sequence,ORF,codon=3):
    """Translate ``sequence`` codon by codon, starting at offset ``ORF``.

    Full-length windows of ``codon`` symbols are looked up in the global
    ``codons`` table and the mapped values are concatenated. Translation
    stops at the first window that is not a key of ``codons``.

    Args:
        sequence: Sequence to translate (typically a string).
        ORF: Index of the first symbol of the first window.
        codon: Window width and step; defaults to 3.

    Returns:
        Whatever ``validate_protein`` returns for the concatenated result.
    """
    # NOTE(review): `codons` is read as a global but no definition is
    # visible in this file - presumably a mapping from triplet to an
    # amino-acid symbol; confirm it exists before this function is called.
    residues = []
    last_start = len(sequence) - codon

    for start in range(ORF, last_start + 1, codon):
        triplet = sequence[start:start + codon]
        if triplet not in codons:
            break
        residues.append(codons[triplet])
    return validate_protein("".join(residues))

# Demo: run reading_frame on three sample sequences with start offsets
# 0, 1 and 2, printing each result next to its offset.
res0 = reading_frame(sequence="AUGCGAUGA",ORF=0)
print("For ORF = 0 : %s" %(res0))
res1 = reading_frame(sequence="CGAGCCUAA",ORF=1)
print("For ORF = 1 : %s" %(res1))
res2 = reading_frame(sequence="CGAGCCUAG",ORF=2)
print("For ORF = 2 : %s" %(res2))

def count_gaps(seq1,seq2):
    """Count the gap symbols in each of two sequences.

    A gap is a single space character (" ").

    Args:
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        A ``(gaps_in_seq1, gaps_in_seq2)`` tuple of integers.
    """
    gap_symbol = " "
    first_total = sum(1 for symbol in seq1 if symbol == gap_symbol)
    second_total = sum(1 for symbol in seq2 if symbol == gap_symbol)
    return first_total, second_total

# Demo: count the space characters in two sample sequences. The second
# print argument is the implicitly concatenated literal
# "\nGaps in seq2 = %d", so the two counts appear on separate lines.
gap1,gap2 = count_gaps("ATGCATGC","ACG TAG ATGC")
print("Gaps in seq1 = %d" %gap1,"\n" "Gaps in seq2 = %d" %gap2)


def common_substring(seq1,k_mer):
    """Tally every substring of length ``k_mer`` found in ``seq1``.

    Args:
        seq1: String to scan.
        k_mer: Length of the sliding window.

    Returns:
        A dict mapping each distinct window of ``k_mer`` symbols to the
        number of start positions at which it occurs.
    """
    tally = {}
    window_starts = range(len(seq1) - k_mer + 1)

    for start in window_starts:
        window = seq1[start:start + k_mer]
        tally[window] = tally.get(window, 0) + 1
    return tally

def count_common_pattern(res):
    """Return the highest count in ``res`` and the key that holds it.

    Args:
        res: Dict mapping a pattern to its occurrence count.

    Returns:
        A ``(count, pattern)`` tuple for the entry with the largest count.
        On a tie the entry that comes first in ``res`` wins. For an empty
        dict ``(0, None)`` is returned.
    """
    if not res:
        # No entries at all: there is no key to report, so return a
        # neutral result instead of failing on an unbound local.
        return 0, None
    # max() keeps the first maximal item, matching a strict ">" scan.
    current_key = max(res, key=res.get)
    return res[current_key], current_key

# Demo: tally the length-1 substrings of a sample sequence, then print the
# most frequent one (pat) together with its count (num).
num,pat = count_common_pattern(common_substring("ATGCTGTGATGC",1))
print("Common substring is: %s" %(pat), "and number of times it occurs is: %d"%(num))


def mapping_patterns(pattern,text):
    """Return every start index at which ``pattern`` occurs in ``text``.

    Args:
        pattern: Sub-sequence to look for.
        text: Sequence to scan (typically a string).

    Returns:
        A list of 0-based start indices in ascending order, overlapping
        occurrences included.
    """
    width = len(pattern)
    window_starts = range(len(text) - width + 1)
    return [start for start in window_starts if text[start:start + width] == pattern]

# Interactive demo: read a text and a number of patterns from standard
# input, and after each pattern is entered print the list of positions at
# which it occurs in the text.
my_text = input("Enter the text")
patterns = int(input("How many patterns do you want to match with the text:"))

pattern_list = []
for i in range(0,patterns):
    pattern_list.append(input("Enter the pattern:"))
    # Only the pattern just entered needs scanning: re-scanning every
    # earlier pattern on each pass would just overwrite `res` until the
    # newest one, at quadratic cost.
    res = mapping_patterns(pattern_list[-1],my_text)
    print(res)


#count_codons will count the number of codons present in a given sequence

def count_codons(sequence, codon=3):
    """Return the number of complete codons contained in ``sequence``.

    The sequence is read in consecutive, non-overlapping windows of
    ``codon`` symbols starting at index 0. A trailing fragment shorter
    than ``codon`` is not a codon and is not counted.

    Args:
        sequence: Sequence supporting ``len`` (typically a string).
        codon: Number of symbols per codon; defaults to 3.

    Returns:
        The count of full-length windows as an ``int``.

    Raises:
        ValueError: If ``codon`` is smaller than 1.
    """
    if codon < 1:
        raise ValueError("codon size must be a positive integer")
    # Floor division keeps only complete windows.
    return len(sequence) // codon

# Demo: count the codons of a 15-symbol sample sequence and print the count.
res = count_codons("ATTATGCCGGCCACC")
print(res)


def count_CpG(sequence,pattern='CG'):
    """Count the occurrences of ``pattern`` in ``sequence``.

    Args:
        sequence: Sequence to scan (typically a string).
        pattern: Sub-sequence to look for; defaults to ``'CG'``.

    Returns:
        The number of start positions at which ``pattern`` occurs,
        including an occurrence that ends exactly at the end of
        ``sequence``.
    """
    pat_len = len(pattern)
    # The final valid start index is len(sequence) - pat_len, so the range
    # bound must be one past it; otherwise a match at the very end of the
    # sequence is skipped.
    last_start = len(sequence) - pat_len
    return sum(
        1
        for i in range(last_start + 1)
        if sequence[i : i + pat_len] == pattern
    )

# Demo: count the 'CG' occurrences in a sample sequence and print the count.
res = count_CpG("ATTGCGCGCGGGCGATTACGGCG")
print("Number of CpGs present in the sequence is : %d" %res)