Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
import re
from builtins import max as builtin_max
from typing import Iterator, List, Optional, Union

from Bio import AlignIO
from Bio import SeqIO as SeqIO
from Bio.SeqRecord import SeqRecord

from ..utils import PhytestObject, assert_or_warn
12class Sequence(PhytestObject, SeqRecord):
@classmethod
def parse(cls, alignment_path, alignment_format) -> 'Iterator[Sequence]':
    """
    Reads the records of a sequence file and wraps each one in this class.

    Args:
        alignment_path: The file to read. Passed unchanged to `SeqIO.parse`.
        alignment_format: The format of the file. Passed unchanged to `SeqIO.parse`.

    Returns:
        Iterator[Sequence]: A generator producing one object per record returned by `SeqIO.parse`.
            Each object carries over the record's seq, id, name, description, dbxrefs, features,
            annotations and letter_annotations.
    """
    # `cls` instead of a hard-coded `Sequence`, so that a subclass calling
    # `parse` gets instances of itself back.
    return (
        cls(
            r.seq,
            id=r.id,
            name=r.name,
            description=r.description,
            dbxrefs=r.dbxrefs,
            features=r.features,
            annotations=r.annotations,
            letter_annotations=r.letter_annotations,
        )
        for r in SeqIO.parse(alignment_path, alignment_format)
    )
def assert_valid_alphabet(self, alphabet: str = "ATCGN-", *, warning: bool = False) -> None:
    """
    Asserts that the sequence only contains characters from a given alphabet.

    Args:
        alphabet (str): A string containing the legal characters. Defaults to 'ATCGN-'.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # A negated character class locates the first character outside the alphabet.
    # Escaping keeps characters such as '-' or ']' literal inside the brackets.
    offending = re.search(f"[^{re.escape(alphabet)}]", str(self.seq))
    if offending is None:
        return
    assert_or_warn(
        False,
        warning,
        f"Invalid pattern found in '{self.id}'.",
        f"Character '{offending.group(0)}' at position {offending.start(0)+1} found which is not in alphabet '{alphabet}'.",
    )
def assert_length(
    self,
    length: Optional[int] = None,
    *,
    min: Optional[int] = None,
    max: Optional[int] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the length of the sequence satisfies every criterion that was supplied.

    Args:
        length (int, optional): If set, then sequence length must be equal to this value. Defaults to None.
        min (int, optional): If set, then sequence length must be equal to or greater than this value. Defaults to None.
        max (int, optional): If set, then sequence length must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    n_sites = len(self.seq)

    # Criteria left as None are skipped; the rest are checked in the order exact, lower, upper.
    if length is not None:
        is_exact = n_sites == length
        assert_or_warn(
            is_exact,
            warning,
            f"Sequence length of '{self.id}' ({n_sites}) is not equal to the required length of {length}.",
        )

    if min is not None:
        is_long_enough = n_sites >= min
        assert_or_warn(
            is_long_enough,
            warning,
            f"Sequence length of '{self.id}' ({n_sites}) is less than the minimum {min}.",
        )

    if max is not None:
        is_short_enough = n_sites <= max
        assert_or_warn(
            is_short_enough,
            warning,
            f"Sequence length of '{self.id}' ({n_sites}) is greater than the maximum {max}.",
        )
def assert_count(
    self,
    pattern: str,
    *,
    count: Optional[int] = None,
    min: Optional[int] = None,
    max: Optional[int] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the number of times a pattern occurs in the sequence satisfies the supplied criteria.

    The number of occurrences is whatever `self.seq.count(pattern)` reports.

    Args:
        pattern (str): The pattern to count in the sequence.
        count (int, optional): If set, then pattern count must be equal to this value. Defaults to None.
        min (int, optional): If set, then pattern count must be equal to or greater than this value. Defaults to None.
        max (int, optional): If set, then pattern count must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    occurrences = self.seq.count(pattern)
    # Shared first sentence of every failure message; each check appends its own detail.
    headline = f"Sequence '{self.id}' matches pattern '{pattern}' {occurrences} time(s)."

    if count is not None:
        is_exact = occurrences == count
        assert_or_warn(is_exact, warning, headline, f"This is not equal to the required number of {count}.")

    if min is not None:
        is_enough = occurrences >= min
        assert_or_warn(is_enough, warning, headline, f"This is less than the minimum {min}.")

    if max is not None:
        is_within_cap = occurrences <= max
        assert_or_warn(is_within_cap, warning, headline, f"This is greater than the maximum {max}.")
def assert_percent(
    self,
    nucleotide: Union[str, List[str]],
    *,
    percent: Optional[float] = None,
    min: Optional[float] = None,
    max: Optional[float] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the percentage of a nucleotide in the sequence meets the specified criteria.

    When a list is given, the occurrences of all of its entries are added together.
    An empty sequence is treated as containing 0.0 percent of any nucleotide.

    Args:
        nucleotide (Union[str, List[str]]): The nucleotide(s) to count in the sequence.
            Either a single character or a list of characters.
        percent (float, optional): If set, then nucleotide percentage must be equal to this value. Defaults to None.
        min (float, optional): If set, then nucleotide percentage must be equal to or greater than this value. Defaults to None.
        max (float, optional): If set, then nucleotide percentage must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.

    Raises:
        ValueError: If `nucleotide` is a string longer than one character, or is neither a str nor a list.
    """
    # Validate the request and settle on the label used in messages *before* doing any
    # arithmetic, so the label is the same whether or not the sequence is empty.
    if isinstance(nucleotide, str):
        if len(nucleotide) > 1:
            raise ValueError(
                f"The length of the requested nucleotide '{nucleotide}' is more than a single character. "
                "This value should either be a single character (i.e. A, G, C, T) or a list of single characters."
            )
        targets = [nucleotide]
        label = nucleotide
    elif isinstance(nucleotide, list):
        targets = nucleotide
        label = ', '.join(nucleotide)
    else:
        raise ValueError(f"Nucleotide must be str or list and cannot be of type '{type(nucleotide)}'.")

    sequence_length = len(self.seq)
    if sequence_length:
        base_percent = (sum(self.seq.count(x) for x in targets) * 100.0) / sequence_length
    else:
        # Nothing to divide by: an empty sequence contains none of the requested nucleotides.
        base_percent = 0.0

    summary = f"Sequence '{self.id}' contains {base_percent} percent '{label}'."
    if percent is not None:
        assert_or_warn(
            base_percent == percent,
            warning,
            summary,
            f"This is not equal to the required percentage of {percent}.",
        )
    if min is not None:
        assert_or_warn(
            base_percent >= min,
            warning,
            summary,
            f"This is less than the minimum {min}.",
        )
    if max is not None:
        assert_or_warn(
            base_percent <= max,
            warning,
            summary,
            f"This is greater than the maximum {max}.",
        )
def assert_percent_GC(
    self,
    percent: Optional[float] = None,
    *,
    min: Optional[float] = None,
    max: Optional[float] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the percent of GC's (ambiguous nucleotide S) in the sequence meets the specified criteria.

    Delegates to `assert_percent`, counting G, C and S in both upper and lower case.

    Args:
        percent (float, optional): If set, then the percentage of GC's must be equal to this value. Defaults to None.
        min (float, optional): If set, then the percentage of GC's must be equal to or greater than this value. Defaults to None.
        max (float, optional): If set, then the percentage of GC's must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # The limits are percentages, so they are annotated as float to agree with `assert_percent`.
    self.assert_percent(
        nucleotide=["G", "C", "g", "c", "S", "s"], percent=percent, min=min, max=max, warning=warning
    )
def assert_percent_N(
    self,
    percent: Optional[float] = None,
    *,
    min: Optional[float] = None,
    max: Optional[float] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the percent of N's in the sequence meets the specified criteria.

    Delegates to `assert_percent`, counting both 'N' and 'n'.

    Args:
        percent (float, optional): If set, then the percentage of N's must be equal to this value. Defaults to None.
        min (float, optional): If set, then the percentage of N's must be equal to or greater than this value. Defaults to None.
        max (float, optional): If set, then the percentage of N's must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # The limits are percentages, so they are annotated as float to agree with `assert_percent`.
    self.assert_percent(nucleotide=["N", "n"], percent=percent, min=min, max=max, warning=warning)
def assert_percent_gaps(
    self,
    percent: Optional[float] = None,
    *,
    min: Optional[float] = None,
    max: Optional[float] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the percent of gaps (-) in the sequence meets the specified criteria.

    Delegates to `assert_percent` with the gap character '-'.

    Args:
        percent (float, optional): If set, then the percentage of gaps must be equal to this value. Defaults to None.
        min (float, optional): If set, then the percentage of gaps must be equal to or greater than this value. Defaults to None.
        max (float, optional): If set, then the percentage of gaps must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # The limits are percentages, so they are annotated as float to agree with `assert_percent`.
    self.assert_percent(nucleotide='-', percent=percent, min=min, max=max, warning=warning)
def assert_count_Ns(
    self,
    count: Optional[int] = None,
    *,
    min: Optional[int] = None,
    max: Optional[int] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the number of N's in the sequence meets the specified criteria.

    Delegates to `assert_count` with the pattern 'N' (upper case).

    Args:
        count (int, optional): If set, then the number of N's must be equal to this value. Defaults to None.
        min (int, optional): If set, then the number of N's must be equal to or greater than this value. Defaults to None.
        max (int, optional): If set, then the number of N's must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # Forward the caller's limits untouched; only the pattern is fixed here.
    limits = {"count": count, "min": min, "max": max}
    self.assert_count(pattern='N', warning=warning, **limits)
def assert_count_gaps(
    self,
    count: Optional[int] = None,
    *,
    min: Optional[int] = None,
    max: Optional[int] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the number of gaps (-) in the sequence meets the specified criteria.

    Delegates to `assert_count` with the pattern '-'.

    Args:
        count (int, optional): If set, then the number of gaps (-) must be equal to this value. Defaults to None.
        min (int, optional): If set, then the number of gaps (-) must be equal to or greater than this value. Defaults to None.
        max (int, optional): If set, then the number of gaps (-) must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # Forward the caller's limits untouched; only the pattern is fixed here.
    limits = {"count": count, "min": min, "max": max}
    self.assert_count(pattern='-', warning=warning, **limits)
def assert_longest_stretch(
    self,
    pattern: str,
    *,
    count: Optional[int] = None,
    min: Optional[int] = None,
    max: Optional[int] = None,
    warning: bool = False,
) -> None:
    """
    Asserts that the longest stretch of a pattern in the sequence meets the specified criteria.

    e.g. the longest stretch of N's in 'ANNNANNA' is 3.

    The pattern is matched literally (it is not interpreted as a regular expression), and a
    stretch is one or more back-to-back repeats of the whole pattern. The length of a stretch
    is measured in characters. If the pattern does not occur at all, the longest stretch is 0.

    Args:
        pattern (str): The pattern whose consecutive repeats are searched for in the sequence.
        count (int, optional): If set, then the longest stretch of the pattern must be equal to this value. Defaults to None.
        min (int, optional): If set, then the longest stretch of the pattern must be equal to or greater than this value. Defaults to None.
        max (int, optional): If set, then the longest stretch of the pattern must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    if pattern:
        # Escaping makes characters such as '*', '?' or '.' match themselves instead of acting
        # as regex operators. The non-capturing group makes '+' repeat the whole pattern rather
        # than only its final character.
        runs = re.finditer(f"(?:{re.escape(pattern)})+", str(self.seq))
        # Pick the maximum by run length; comparing the matched strings themselves would
        # order them alphabetically. `builtin_max` is used because the `max` parameter
        # shadows the builtin inside this method.
        longest_stretch = builtin_max((run.end() - run.start() for run in runs), default=0)
    else:
        # An empty pattern cannot form a stretch.
        longest_stretch = 0

    summary = f"The longest stretch of pattern '{pattern}' in sequence '{self.id}' is {longest_stretch}."
    if count is not None:
        assert_or_warn(
            longest_stretch == count,
            warning,
            summary,
            f"This is not equal to the required number of {count}.",
        )
    if min is not None:
        assert_or_warn(
            longest_stretch >= min,
            warning,
            summary,
            f"This is less than the minimum {min}.",
        )
    if max is not None:
        assert_or_warn(
            longest_stretch <= max,
            warning,
            summary,
            f"This is greater than the maximum {max}.",
        )
def assert_longest_stretch_Ns(
    self,
    count: Optional[int] = None,
    *,
    min: Optional[int] = None,
    max: Optional[int] = None,
    warning: bool = False,
):
    """
    Asserts that the longest stretch of N's in the sequence meets the specified criteria.

    e.g. the longest stretch of N's in 'ANNNANNA' is 3.

    Delegates to `assert_longest_stretch` with the pattern 'N'.

    Args:
        count (int, optional): If set, then the longest stretch of N's must be equal to this value. Defaults to None.
        min (int, optional): If set, then the longest stretch of N's must be equal to or greater than this value. Defaults to None.
        max (int, optional): If set, then the longest stretch of N's must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # Forward the caller's limits untouched; only the pattern is fixed here.
    limits = {"count": count, "min": min, "max": max}
    self.assert_longest_stretch(pattern='N', warning=warning, **limits)
def assert_longest_stretch_gaps(
    self,
    count: Optional[int] = None,
    *,
    min: Optional[int] = None,
    max: Optional[int] = None,
    warning: bool = False,
):
    """
    Asserts that the longest stretch of gaps (-) in the sequence meets the specified criteria.

    e.g. the longest stretch of gaps (-) in 'A---A--A' is 3.

    Delegates to `assert_longest_stretch` with the pattern '-'.

    Args:
        count (int, optional): If set, then the longest stretch of gaps (-) must be equal to this value. Defaults to None.
        min (int, optional): If set, then the longest stretch of gaps (-) must be equal to or greater than this value. Defaults to None.
        max (int, optional): If set, then the longest stretch of gaps (-) must be equal to or less than this value. Defaults to None.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # Forward the caller's limits untouched; only the pattern is fixed here.
    limits = {"count": count, "min": min, "max": max}
    self.assert_longest_stretch(pattern='-', warning=warning, **limits)
def assert_startswith(self, pattern: str, *, warning: bool = False):
    """
    Asserts that the sequence begins with the given pattern.

    Args:
        pattern (str): The sequence must start with this value.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    has_prefix = self.seq.startswith(pattern)
    failure_message = f"Sequence '{self.id}' does not start with '{pattern}'."
    assert_or_warn(has_prefix, warning, failure_message)
def assert_endswith(self, pattern: str, *, warning: bool = False):
    """
    Asserts that the sequence finishes with the given pattern.

    Args:
        pattern (str): The sequence must end with this value.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    has_suffix = self.seq.endswith(pattern)
    failure_message = f"Sequence '{self.id}' does not end with '{pattern}'."
    assert_or_warn(has_suffix, warning, failure_message)
def assert_contains(self, pattern: str, *, warning: bool = False):
    """
    Asserts that the given pattern occurs somewhere in the sequence.

    Implemented as `assert_count` with a minimum of one occurrence.

    Args:
        pattern (str): The sequence must contain this value.
        warning (bool): If True, raise a warning instead of an exception. Defaults to False.
            This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
    """
    # "Contains" means the pattern is counted at least once.
    at_least_once = 1
    self.assert_count(pattern=pattern, warning=warning, min=at_least_once)