Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

import re
from builtins import max as builtin_max
from typing import Iterator, List, Optional, Union

from Bio import AlignIO
from Bio import SeqIO as SeqIO
from Bio.SeqRecord import SeqRecord

from ..utils import PhytestObject, assert_or_warn

10 

11 

class Sequence(PhytestObject, SeqRecord):
    """
    A sequence record extended with assertion methods.

    Every ``assert_*`` method checks one property of the sequence and reports a failed check through
    ``assert_or_warn``. Passing ``warning=True`` asks for a warning instead of an exception.
    """

    @classmethod
    def parse(cls, alignment_path, alignment_format) -> 'Iterator[Sequence]':
        """
        Lazily reads the records of a sequence file and wraps each one in this class.

        Args:
            alignment_path: The file to read. Passed unchanged to `Bio.SeqIO.parse`.
            alignment_format: The name of the file format. Passed unchanged to `Bio.SeqIO.parse`.

        Returns:
            A generator that yields one instance of this class for every record that is read.
        """
        # Instantiate `cls` rather than a hard-coded `Sequence` so that subclasses get their own type back.
        return (
            cls(
                r.seq,
                id=r.id,
                name=r.name,
                description=r.description,
                dbxrefs=r.dbxrefs,
                features=r.features,
                annotations=r.annotations,
                letter_annotations=r.letter_annotations,
            )
            for r in SeqIO.parse(alignment_path, alignment_format)
        )

    def assert_valid_alphabet(self, alphabet: str = "ATCGN-", *, warning: bool = False) -> None:
        """
        Asserts that the sequence only contains particular characters.

        Only the first illegal character found is reported.

        Args:
            alphabet (str): A string containing legal characters. Defaults to 'ATCGN-'.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        if alphabet:
            # `re.escape` keeps characters such as '-', '^' and ']' literal inside the character class.
            regex_invalid = re.compile(f"[^{re.escape(alphabet)}]")
        else:
            # `[^]` is not a valid regex. With no legal characters, every character is illegal.
            regex_invalid = re.compile(".", re.DOTALL)
        result = regex_invalid.search(str(self.seq))
        if result:
            # A match means the check has failed, so the condition handed over is always False here.
            assert_or_warn(
                False,
                warning,
                f"Invalid pattern found in '{self.id}'.",
                f"Character '{result.group(0)}' at position {result.start(0)+1} found which is not in alphabet '{alphabet}'.",
            )

    def assert_length(
        self,
        length: Optional[int] = None,
        *,
        min: Optional[int] = None,
        max: Optional[int] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the sequence length meets the specified criteria.

        Each criterion that is set is checked independently, in the order length, min, max.

        Args:
            length (int, optional): If set, then sequence length must be equal to this value. Defaults to None.
            min (int, optional): If set, then sequence length must be equal to or greater than this value. Defaults to None.
            max (int, optional): If set, then sequence length must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        sequence_length = len(self.seq)
        if length is not None:
            assert_or_warn(
                sequence_length == length,
                warning,
                f"Sequence length of '{self.id}' ({sequence_length}) is not equal to the required length of {length}.",
            )
        if min is not None:
            assert_or_warn(
                sequence_length >= min,
                warning,
                f"Sequence length of '{self.id}' ({sequence_length}) is less than the minimum {min}.",
            )
        if max is not None:
            assert_or_warn(
                sequence_length <= max,
                warning,
                f"Sequence length of '{self.id}' ({sequence_length}) is greater than the maximum {max}.",
            )

    def assert_count(
        self,
        pattern: str,
        *,
        count: Optional[int] = None,
        min: Optional[int] = None,
        max: Optional[int] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the count of a pattern in the sequence meets the specified criteria.

        The count is the value returned by `self.seq.count(pattern)`.

        Args:
            pattern (str): The pattern to count in the sequence.
            count (int, optional): If set, then pattern count must be equal to this value. Defaults to None.
            min (int, optional): If set, then pattern count must be equal to or greater than this value. Defaults to None.
            max (int, optional): If set, then pattern count must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        base_count = self.seq.count(pattern)
        summary = f"Sequence '{self.id}' matches pattern '{pattern}' {base_count} time(s)."
        if count is not None:
            assert_or_warn(
                base_count == count,
                warning,
                summary,
                f"This is not equal to the required number of {count}.",
            )
        if min is not None:
            assert_or_warn(
                base_count >= min,
                warning,
                summary,
                f"This is less than the minimum {min}.",
            )
        if max is not None:
            assert_or_warn(
                base_count <= max,
                warning,
                summary,
                f"This is greater than the maximum {max}.",
            )

    def assert_percent(
        self,
        nucleotide: Union[str, List[str]],
        *,
        percent: Optional[float] = None,
        min: Optional[float] = None,
        max: Optional[float] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the percentage of a nucleotide in the sequence meets the specified criteria.

        The percentage is on a 0-100 scale. An empty sequence is treated as containing 0.0 percent.
        The `percent` criterion is compared with exact float equality.

        Args:
            nucleotide (Union[str, List[str]]): The nucleotide(s) to count in the sequence. Either a single
                character or a list of characters whose counts are added together.
            percent (float, optional): If set, then nucleotide percentage must be equal to this value. Defaults to None.
            min (float, optional): If set, then nucleotide percentage must be equal to or greater than this value. Defaults to None.
            max (float, optional): If set, then nucleotide percentage must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.

        Raises:
            ValueError: If `nucleotide` is a string that is not exactly one character long,
                or if it is neither a string nor a list.
        """
        if isinstance(nucleotide, str):
            if len(nucleotide) > 1:
                raise ValueError(
                    f"The length of the requested nucleotide '{nucleotide}' is more than a single character. "
                    "This value should either be a single character (i.e. A, G, C, T) or a list of single characters."
                )
            if not nucleotide:
                # Counting the empty string does not count any nucleotide, so reject it explicitly.
                raise ValueError(
                    "The requested nucleotide is an empty string. "
                    "This value should either be a single character (i.e. A, G, C, T) or a list of single characters."
                )
            nucleotide_count = self.seq.count(nucleotide)
        elif isinstance(nucleotide, list):
            nucleotide_count = sum(self.seq.count(x) for x in nucleotide)
            # From here on the name is only used for display in the summary message.
            nucleotide = ', '.join(nucleotide)
        else:
            raise ValueError(f"Nucleotide must be str or list and cannot be of type '{type(nucleotide)}'.")

        sequence_length = len(self.seq)
        # Guard the division so that an empty sequence reports 0.0 percent.
        base_percent = (nucleotide_count * 100.0) / sequence_length if sequence_length else 0.0

        summary = f"Sequence '{self.id}' contains {base_percent} percent '{nucleotide}'."
        if percent is not None:
            assert_or_warn(
                base_percent == percent,
                warning,
                summary,
                f"This is not equal to the required percentage of {percent}.",
            )
        if min is not None:
            assert_or_warn(
                base_percent >= min,
                warning,
                summary,
                f"This is less than the minimum {min}.",
            )
        if max is not None:
            assert_or_warn(
                base_percent <= max,
                warning,
                summary,
                f"This is greater than the maximum {max}.",
            )

    def assert_percent_GC(
        self,
        percent: Optional[float] = None,
        *,
        min: Optional[float] = None,
        max: Optional[float] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the percent of GC's (ambiguous nucleotide S) in the sequence meets the specified criteria.

        The characters G, C and S are counted in both upper and lower case.

        Args:
            percent (float, optional): If set, then the percentage of GC's must be equal to this value. Defaults to None.
            min (float, optional): If set, then the percentage of GC's must be equal to or greater than this value. Defaults to None.
            max (float, optional): If set, then the percentage of GC's must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_percent(
            nucleotide=["G", "C", "g", "c", "S", "s"], percent=percent, min=min, max=max, warning=warning
        )

    def assert_percent_N(
        self,
        percent: Optional[float] = None,
        *,
        min: Optional[float] = None,
        max: Optional[float] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the percent of N's in the sequence meets the specified criteria.

        Both 'N' and 'n' are counted.

        Args:
            percent (float, optional): If set, then the percentage of N's must be equal to this value. Defaults to None.
            min (float, optional): If set, then the percentage of N's must be equal to or greater than this value. Defaults to None.
            max (float, optional): If set, then the percentage of N's must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_percent(nucleotide=["N", "n"], percent=percent, min=min, max=max, warning=warning)

    def assert_percent_gaps(
        self,
        percent: Optional[float] = None,
        *,
        min: Optional[float] = None,
        max: Optional[float] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the percent of gaps (-) in the sequence meets the specified criteria.

        Args:
            percent (float, optional): If set, then the percentage of gaps must be equal to this value. Defaults to None.
            min (float, optional): If set, then the percentage of gaps must be equal to or greater than this value. Defaults to None.
            max (float, optional): If set, then the percentage of gaps must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_percent(nucleotide='-', percent=percent, min=min, max=max, warning=warning)

    def assert_count_Ns(
        self,
        count: Optional[int] = None,
        *,
        min: Optional[int] = None,
        max: Optional[int] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the number of N's in the sequence meets the specified criteria.

        Only the upper-case pattern 'N' is passed to `assert_count`.

        Args:
            count (int, optional): If set, then the number of N's must be equal to this value. Defaults to None.
            min (int, optional): If set, then the number of N's must be equal to or greater than this value. Defaults to None.
            max (int, optional): If set, then the number of N's must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_count(pattern='N', count=count, min=min, max=max, warning=warning)

    def assert_count_gaps(
        self,
        count: Optional[int] = None,
        *,
        min: Optional[int] = None,
        max: Optional[int] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the number of gaps (-) in the sequence meets the specified criteria.

        Args:
            count (int, optional): If set, then the number of gaps (-) must be equal to this value. Defaults to None.
            min (int, optional): If set, then the number of gaps (-) must be equal to or greater than this value. Defaults to None.
            max (int, optional): If set, then the number of gaps (-) must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_count(pattern='-', count=count, min=min, max=max, warning=warning)

    def assert_longest_stretch(
        self,
        pattern: str,
        *,
        count: Optional[int] = None,
        min: Optional[int] = None,
        max: Optional[int] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the longest stretch of a pattern in the sequence meets the specified criteria.

        e.g. the longest stretch of N's in 'ANNNANNA' is 3.

        The pattern is interpreted as a regular expression and the stretch is measured in characters,
        so the longest run of back-to-back repeats of the whole pattern is reported.

        Args:
            pattern (str): The pattern to look for in the sequence.
            count (int, optional): If set, then the longest stretch of the pattern must be equal to this value. Defaults to None.
            min (int, optional): If set, then the longest stretch of the pattern must be equal to or greater than this value. Defaults to None.
            max (int, optional): If set, then the longest stretch of the pattern must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        # The non-capturing group makes `+` repeat the whole pattern instead of only its last atom.
        # `finditer` with `group(0)` always gives the full match, even if the pattern has its own groups.
        stretches = re.finditer(f'(?:{pattern})+', str(self.seq))
        # Compare the stretches by length; comparing the strings themselves would order them alphabetically.
        longest_stretch = builtin_max((len(stretch.group(0)) for stretch in stretches), default=0)
        summary = f"The longest stretch of pattern '{pattern}' in sequence '{self.id}' is {longest_stretch}."
        if count is not None:
            assert_or_warn(
                longest_stretch == count,
                warning,
                summary,
                f"This is not equal to the required number of {count}.",
            )
        if min is not None:
            assert_or_warn(
                longest_stretch >= min,
                warning,
                summary,
                f"This is less than the minimum {min}.",
            )
        if max is not None:
            assert_or_warn(
                longest_stretch <= max,
                warning,
                summary,
                f"This is greater than the maximum {max}.",
            )

    def assert_longest_stretch_Ns(
        self,
        count: Optional[int] = None,
        *,
        min: Optional[int] = None,
        max: Optional[int] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the longest stretch of N's in the sequence meets the specified criteria.

        e.g. the longest stretch of N's in 'ANNNANNA' is 3.

        Args:
            count (int, optional): If set, then the longest stretch of N's must be equal to this value. Defaults to None.
            min (int, optional): If set, then the longest stretch of N's must be equal to or greater than this value. Defaults to None.
            max (int, optional): If set, then the longest stretch of N's must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_longest_stretch(pattern='N', count=count, min=min, max=max, warning=warning)

    def assert_longest_stretch_gaps(
        self,
        count: Optional[int] = None,
        *,
        min: Optional[int] = None,
        max: Optional[int] = None,
        warning: bool = False,
    ) -> None:
        """
        Asserts that the longest stretch of gaps (-) in the sequence meets the specified criteria.

        e.g. the longest stretch of gaps (-) in 'A---A--A' is 3.

        Args:
            count (int, optional): If set, then the longest stretch of gaps (-) must be equal to this value. Defaults to None.
            min (int, optional): If set, then the longest stretch of gaps (-) must be equal to or greater than this value. Defaults to None.
            max (int, optional): If set, then the longest stretch of gaps (-) must be equal to or less than this value. Defaults to None.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_longest_stretch(pattern='-', count=count, min=min, max=max, warning=warning)

    def assert_startswith(self, pattern: str, *, warning: bool = False) -> None:
        """
        Asserts that the sequence starts with a particular pattern.

        Args:
            pattern (str): The sequence must start with this value.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        assert_or_warn(
            self.seq.startswith(pattern),
            warning,
            f"Sequence '{self.id}' does not start with '{pattern}'.",
        )

    def assert_endswith(self, pattern: str, *, warning: bool = False) -> None:
        """
        Asserts that the sequence ends with a particular pattern.

        Args:
            pattern (str): The sequence must end with this value.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        assert_or_warn(
            self.seq.endswith(pattern),
            warning,
            f"Sequence '{self.id}' does not end with '{pattern}'.",
        )

    def assert_contains(self, pattern: str, *, warning: bool = False) -> None:
        """
        Asserts that the sequence contains a particular pattern.

        This is checked by requiring a pattern count of at least one through `assert_count`.

        Args:
            pattern (str): The sequence must contain this value.
            warning (bool): If True, raise a warning instead of an exception. Defaults to False.
                This flag can be set by running this method with the prefix `warn_` instead of `assert_`.
        """
        self.assert_count(pattern=pattern, min=1, warning=warning)

415 Args: 

416 pattern (str): The sequence must contain this value. 

417 warning (bool): If True, raise a warning instead of an exception. Defaults to False. 

418 This flag can be set by running this method with the prefix `warn_` instead of `assert_`. 

419 """ 

420 self.assert_count(pattern=pattern, min=1, warning=warning)