SeqAn3
edit_distance_unbanded.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <algorithm>
43 #include <bitset>
44 #include <utility>
45 
46 #include <range/v3/algorithm/copy.hpp>
47 #include <range/v3/utility/iterator.hpp>
48 
56 #include <seqan3/std/ranges>
57 
58 namespace seqan3::detail
59 {
61 template <typename align_config_t>
62 concept semi_global_config_concept = requires (align_config_t & cfg)
63 {
64  requires get<align_cfg::id::sequence_ends>(cfg) == free_ends_at::seq1;
65 };
66 
67 template <typename align_config_t>
68 concept global_config_concept = has_align_cfg_v<align_cfg::id::global, std::remove_reference_t<align_config_t>>;
69 
70 template <typename align_config_t>
71 concept max_errors_concept = has_align_cfg_v<align_cfg::id::max_error, std::remove_reference_t<align_config_t>>;
73 
77 template <typename traits_type>
78 concept edit_distance_trait_concept = requires
79 {
80  typename std::remove_reference_t<traits_type>::word_type;
81 };
82 
//!\brief The traits used by default for the edit distance algorithm: bit vectors are stored in 64 bit words.
struct default_edit_distance_trait_type
{
    //!\brief The unsigned integer type that holds one block of the bit vectors.
    using word_type = uint64_t;
};
91 
98 template <std::ranges::ViewableRange database_t,
100  typename align_config_t,
101  edit_distance_trait_concept traits_t = default_edit_distance_trait_type>
102 class pairwise_alignment_edit_distance_unbanded
103 {
107  friend alignment_score_matrix<pairwise_alignment_edit_distance_unbanded>;
110  friend alignment_trace_matrix<pairwise_alignment_edit_distance_unbanded>;
112 
114  database_t database;
116  query_t query;
118  align_config_t config;
119 
120 public:
122  using word_type = typename std::remove_reference_t<traits_t>::word_type;
124  using score_type = int;
126  using database_type = std::remove_reference_t<database_t>;
128  using query_type = std::remove_reference_t<query_t>;
130  using score_matrix_type = detail::alignment_score_matrix<pairwise_alignment_edit_distance_unbanded>;
132  using trace_matrix_type = detail::alignment_trace_matrix<pairwise_alignment_edit_distance_unbanded>;
133 
135  static constexpr uint8_t word_size = sizeof(word_type) * 8;
136 
137 private:
139  using database_iterator = std::ranges::iterator_t<database_type>;
141  using query_alphabet_type = std::remove_reference_t<reference_t<query_type>>;
142 
143  //TODO Make it dynamic.
144  // using result_type = align_result<type_list<uint32_t, int>>;
145 
147  static constexpr bool use_max_errors = detail::max_errors_concept<align_config_t>;
149  static constexpr bool is_semi_global = detail::semi_global_config_concept<align_config_t>;
151  static constexpr bool is_global = detail::global_config_concept<align_config_t> && !is_semi_global;
152 
154  static constexpr word_type hp0 = is_global ? 1 : 0;
155 
156  static_assert(8 * sizeof(word_type) <= 64, "we assume at most uint64_t as word_type");
157  static_assert((is_global && !is_semi_global) || (!is_global && is_semi_global), "Either set global or semi-global");
158 
160  score_type _score{};
163  word_type score_mask{0};
165  std::vector<word_type> vp{};
167  std::vector<word_type> vn{};
170  std::vector<word_type> bit_masks{};
175  score_type _best_score{};
185  database_iterator _best_score_col{};
186 
190  score_type max_errors{255};
193  size_t last_block{0};
195  word_type last_score_mask{};
197 
199  database_iterator database_it{};
201  database_iterator database_it_end{};
202 
204  struct state_type
205  {
207  std::vector<word_type> vp{};
209  std::vector<word_type> vn{};
210  };
211 
213  std::vector<state_type> states{};
214 
216  void add_state()
217  {
218  states.push_back(state_type{vp, vn});
219  }
220 
221 public:
222 
228  pairwise_alignment_edit_distance_unbanded() = delete;
229  pairwise_alignment_edit_distance_unbanded(pairwise_alignment_edit_distance_unbanded const &) = default;
230  pairwise_alignment_edit_distance_unbanded(pairwise_alignment_edit_distance_unbanded &&) = default;
231  pairwise_alignment_edit_distance_unbanded & operator=(pairwise_alignment_edit_distance_unbanded const &) = default;
232  pairwise_alignment_edit_distance_unbanded & operator=(pairwise_alignment_edit_distance_unbanded &&) = default;
233 
239  pairwise_alignment_edit_distance_unbanded(database_t && _database, query_t && _query, align_config_t _config) :
240  database{std::forward<database_t>(_database)},
241  query{std::forward<query_t>(_query)},
242  config{std::forward<align_config_t>(_config)},
243  _score{static_cast<score_type>(query.size())},
244  _best_score{static_cast<score_type>(query.size())},
245  _best_score_col{ranges::begin(database)},
246  database_it{ranges::begin(database)},
247  database_it_end{ranges::end(database)}
248  {
249  static constexpr size_t alphabet_size = alphabet_size_v<query_alphabet_type>;
250 
251  if constexpr(use_max_errors)
252  {
253  max_errors = get<align_cfg::id::max_error>(config);
254  assert(max_errors >= score_type{0});
255  }
256 
257  size_t block_count = (query.size() - 1 + word_size) / word_size;
258  score_mask = (word_type)1 << ((query.size() - 1 + word_size) % word_size);
259  last_score_mask = score_mask;
260  last_block = block_count - 1;
261 
262  if constexpr(use_max_errors)
263  {
264  // localMaxErrors either stores the maximal number of _score (me.max_errors) or the needle size minus one.
265  // It is used for the mask computation and setting the initial score (the minus one is there because of the Ukkonen trick).
266  size_t localMaxErrors = std::min<size_t>(max_errors, query.size() - 1);
267  score_mask = (word_type)1 << (localMaxErrors % word_size);
268  last_block = std::min(localMaxErrors / word_size, block_count - 1);
269  _score = localMaxErrors + 1;
270  _best_score = _score;
271  }
272 
273  word_type vp0{static_cast<word_type>(~0)};
274  word_type vn0{0};
275 
276  vp.resize(block_count, vp0);
277  vn.resize(block_count, vn0);
278  bit_masks.resize((alphabet_size + 1) * block_count, 0);
279 
280  // encoding the letters as bit-vectors
281  for (size_t j = 0; j < query.size(); j++)
282  {
283  size_t i = block_count * to_rank(query[j]) + j / word_size;
284  bit_masks[i] |= (word_type)1 << (j % word_size);
285  }
286 
287  add_state();
288  }
290 
291 private:
292 
294  template <bool with_overflow_check>
295  void compute_step(word_type b, word_type & hp, word_type & hn, word_type & vp, word_type & vn, word_type & carry_d0, word_type & carry_hp, word_type & carry_hn)
296  {
297  word_type x, d0, t;
298 
299  x = b | vn;
300  t = vp + (x & vp) + (with_overflow_check ? carry_d0 : 0);
301 
302  d0 = (t ^ vp) | x;
303  hn = vp & d0;
304  hp = vn | ~(vp | d0);
305 
306  if constexpr(with_overflow_check)
307  carry_d0 = (carry_d0 != (word_type)0) ? t <= vp : t < vp;
308 
309  x = (hp << 1) | (with_overflow_check ? carry_hp : hp0);
310  vn = x & d0;
311  vp = (hn << 1) | ~(x | d0) | (with_overflow_check ? carry_hn : 0);
312 
313  if constexpr(with_overflow_check)
314  {
315  carry_hp = hp >> (word_size - 1);
316  carry_hn = hn >> (word_size - 1);
317  }
318  }
319 
321  void advance_score(word_type P, word_type N, word_type mask)
322  {
323  if ((P & mask) != (word_type)0)
324  _score++;
325  else if ((N & mask) != (word_type)0)
326  _score--;
327 
328  if constexpr(is_semi_global)
329  {
330  _best_score_col = (_score <= _best_score) ? database_it : _best_score_col;
331  _best_score = (_score <= _best_score) ? _score : _best_score;
332  }
333  }
334 
336  bool prev_last_active_cell()
337  {
338  score_mask >>= 1;
339  if (score_mask != (word_type)0)
340  return true;
341 
342  if (is_global && last_block == 0)
343  return false;
344 
345  last_block--;
346 
347  score_mask = (word_type)1 << (word_size - 1);
348  return true;
349  }
350 
352  void next_last_active_cell()
353  {
354  score_mask <<= 1;
355  if (score_mask)
356  return;
357 
358  score_mask = 1;
359  last_block++;
360  }
361 
363  bool update_last_active_cell()
364  {
365  // updating the last active cell
366  while (!(_score <= max_errors))
367  {
368  advance_score(vn[last_block], vp[last_block], score_mask);
369  if (!prev_last_active_cell())
370  break;
371  }
372 
373  if ((score_mask == last_score_mask) && (last_block == vp.size() - 1))
374  return on_hit();
375  else
376  {
377  next_last_active_cell();
378  advance_score(vp[last_block], vn[last_block], score_mask);
379  }
380 
381  return false;
382  }
383 
385  bool on_hit()
386  {
387  assert(_score <= max_errors);
388  // _setFinderEnd(finder);
389  //
390  // if constexpr(is_global)
391  // _setFinderLength(finder, endPosition());
392 
393  return false;
394  }
395 
398  inline bool small_patterns();
399 
401  inline bool large_patterns();
402 
404  void _compute()
405  {
406  // limit search width for prefix search
407  if constexpr(use_max_errors && is_global)
408  {
409  size_t max_length = query.size() + max_errors + 1;
410  size_t haystack_length = std::min(database.size(), max_length);
411  database_it_end -= database.size() - haystack_length;
412  }
413 
414  // distinguish between the version for needles not longer than
415  // one machine word and the version for longer needles
416  if (vp.size() <= 1)
417  small_patterns();
418  else
419  large_patterns();
420 
421  if constexpr(is_global)
422  _best_score = _score;
423  }
424 
425 public:
426 
431  template <typename result_type>
432  result_type & operator()(result_type & res)
433  {
434  _compute();
435  if constexpr (std::tuple_size_v<result_type> >= 2)
436  {
437  get<align_result_key::score>(res) = score();
438  }
439 
440  if constexpr (std::tuple_size_v<result_type> >= 3)
441  {
442  get<align_result_key::end>(res) = end_coordinate();
443  }
444 
445  [[maybe_unused]] alignment_trace_matrix matrix = trace_matrix();
446  if constexpr (std::tuple_size_v<result_type> >= 4)
447  {
448  get<align_result_key::begin>(res) = alignment_begin_coordinate(matrix, get<align_result_key::end>(res));
449  }
450 
451  if constexpr (std::tuple_size_v<result_type> >= 5)
452  {
453  get<align_result_key::trace>(res) = alignment_trace(database, query, matrix, get<align_result_key::end>(res));
454  }
455  return res;
456  }
457 
459  score_type score() const noexcept
460  {
461  return -_best_score;
462  }
463 
465  score_matrix_type score_matrix() const noexcept
466  {
467  return score_matrix_type{*this};
468  }
469 
471  trace_matrix_type trace_matrix() const noexcept
472  {
473  return trace_matrix_type{*this};
474  }
475 
477  alignment_coordinate begin_coordinate() const noexcept
478  {
479  alignment_coordinate end = end_coordinate();
480  return alignment_begin_coordinate(trace_matrix(), end);
481  }
482 
484  alignment_coordinate end_coordinate() const noexcept
485  {
486  size_t col = database.size() - 1;
487  if constexpr(is_semi_global)
488  col = std::distance(begin(database), _best_score_col);
489 
490  return {col, query.size() - 1};
491  }
492 
494  auto trace() const noexcept
495  {
496  return alignment_trace(database, query, trace_matrix(), end_coordinate());
497  }
498 };
499 
500 template <typename database_t, typename query_t, typename align_config_t, typename traits_t>
501 bool pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>::small_patterns()
502 {
503  // computing the blocks
504  while (database_it != database_it_end)
505  {
506  word_type hn, hp, _;
507 
508  word_type b = bit_masks[to_rank((query_alphabet_type) *database_it)];
509  compute_step<false>(b, hp, hn, vp[0], vn[0], _, _, _);
510  advance_score(hp, hn, score_mask);
511 
512  if constexpr(use_max_errors)
513  if (_score <= max_errors && on_hit())
514  {
515  add_state();
516  ++database_it;
517  return true;
518  }
519 
520  add_state();
521  ++database_it;
522  }
523 
524  return false;
525 }
526 
527 template <typename database_t, typename query_t, typename align_config_t, typename traits_t>
528 bool pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>::large_patterns()
529 {
530  while (database_it != database_it_end)
531  {
532  word_type hn, hp;
533  word_type carry_d0{0}, carry_hp{hp0}, carry_hn{0};
534  size_t block_offset = vp.size() * to_rank((query_alphabet_type) *database_it);
535 
536  // computing the necessary blocks, carries between blocks following one another are stored
537  for (size_t current_block = 0; current_block <= last_block; current_block++)
538  {
539  word_type b = bit_masks[block_offset + current_block];
540  compute_step<true>(b, hp, hn, vp[current_block], vn[current_block], carry_d0, carry_hp, carry_hn);
541  }
542  advance_score(hp, hn, score_mask);
543 
544  if constexpr(use_max_errors)
545  {
546  // if the active cell is the last of it's block, one additional block has to be calculated
547  bool additional_block = score_mask >> (word_size - 1);
548  if (last_block + 1 == vp.size())
549  additional_block = false;
550 
551  if (additional_block)
552  {
553  size_t current_block = last_block + 1;
554  word_type b = bit_masks[block_offset + current_block];
555  compute_step<false>(b, hp, hn, vp[current_block], vn[current_block], carry_d0, carry_hp, carry_hn);
556  }
557 
558  // updating the last active cell
559  if (update_last_active_cell())
560  {
561  add_state();
562  ++database_it;
563  return true;
564  }
565  }
566 
567  add_state();
568  ++database_it;
569  }
570 
571  return false;
572 }
573 
578 template<typename database_t, typename query_t, typename config_t>
579 pairwise_alignment_edit_distance_unbanded(database_t && database, query_t && query, config_t config)
580  -> pairwise_alignment_edit_distance_unbanded<database_t, query_t, config_t>;
581 
582 template<typename database_t, typename query_t, typename config_t, typename traits_t>
583 pairwise_alignment_edit_distance_unbanded(database_t && database, query_t && query, config_t config, traits_t)
584  -> pairwise_alignment_edit_distance_unbanded<database_t, query_t, config_t, traits_t>;
586 
588 template<typename database_t, typename query_t, typename align_config_t, typename traits_t>
589 class alignment_score_matrix<pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>>
590  : public alignment_score_matrix<std::vector<typename pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>::score_type>>
591 {
592 public:
593 
594  using alignment_type = pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>;
595  using score_type = typename alignment_type::score_type;
596  using base_score_matrix_type = alignment_score_matrix<std::vector<score_type>>;
597  using word_type = typename alignment_type::word_type;
598 
599  static constexpr size_t word_size = sizeof(word_type)*8;
600 
606  alignment_score_matrix() = default;
607  alignment_score_matrix(alignment_score_matrix const &) = default;
608  alignment_score_matrix(alignment_score_matrix &&) = default;
609  alignment_score_matrix & operator=(alignment_score_matrix const &) = default;
610  alignment_score_matrix & operator=(alignment_score_matrix &&) = default;
611 
612  alignment_score_matrix(alignment_type const & alignment) :
613  base_score_matrix_type
614  {
615  [&]{
616  size_t _cols = alignment.database.size() + 1;
617  size_t _rows = alignment.query.size() + 1;
618  std::vector<score_type> scores{};
619  scores.reserve(_cols * _rows);
620 
621  // init first row with 0, 1, 2, 3, ...
622  for (size_t col = 0; col < _cols; ++col)
623  scores[col] = alignment_type::is_global ? col : 0;
624 
625  auto deltas = [&](size_t col)
626  {
627  return [state = alignment.states[col]](size_t row)
628  {
629  using bitset = std::bitset<word_size>;
630 
631  size_t chunk = row / word_size;
632  size_t row_in_chunk = row % word_size;
633  word_type vp = state.vp[chunk];
634  word_type vn = state.vn[chunk];
635 
636  int8_t p = bitset(vp)[row_in_chunk] ? 1 : 0;
637  int8_t n = bitset(vn)[row_in_chunk] ? 1 : 0;
638  return p - n;
639  };
640  };
641 
642  for (size_t col = 0; col < _cols; ++col)
643  {
644  auto delta = deltas(col);
645  for (size_t row = 1; row < _rows; ++row)
646  scores[row * _cols + col] = scores[(row - 1) * _cols + col] + delta(row - 1);
647  }
648 
649  return scores;
650  }(),
651  alignment.query.size() + 1,
652  alignment.database.size() + 1
653  }
654  {
655  }
656  //\}
657 };
658 
659 template<typename database_t, typename query_t, typename align_config_t, typename traits_t>
660 class alignment_trace_matrix<pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>>
661  : public alignment_trace_matrix<database_t const &, query_t const &, align_config_t, alignment_score_matrix<pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>>>
662 {
663 public:
664 
665  using alignment_type = pairwise_alignment_edit_distance_unbanded<database_t, query_t, align_config_t, traits_t>;
666  using score_matrix_type = alignment_score_matrix<alignment_type>;
667  using base_trace_matrix_type = alignment_trace_matrix<database_t const &, query_t const &, align_config_t, score_matrix_type>;
668 
674  alignment_trace_matrix() = default;
675  alignment_trace_matrix(alignment_trace_matrix const &) = default;
676  alignment_trace_matrix(alignment_trace_matrix &&) = default;
677  alignment_trace_matrix & operator=(alignment_trace_matrix const &) = default;
678  alignment_trace_matrix & operator=(alignment_trace_matrix &&) = default;
679 
680  alignment_trace_matrix(alignment_type const & alignment) :
681  base_trace_matrix_type{alignment.database, alignment.query, alignment.config, score_matrix_type{alignment}}
682  {
683  }
685 };
686 
688 
689 } // namespace seqan3::detail
Contains the declaration of seqan3::detail::alignment_score_matrix.
Meta-header for the alignment configuration module.
Contains various shortcuts for common std::ranges functions.
Continuous gaps in the beginning and end of the first sequence are not scored.
::ranges::iterator_t iterator_t
Alias for ranges::iterator_t. Obtains the iterator type of a range.
Definition: ranges:225
Adaptations of concepts from the Ranges TS.
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:185
Contains the declaration of seqan3::detail::alignment_trace_matrix.
Specifies the requirements of a Range type that is either a std::ranges::View or an lvalue-reference...
Definition: aligned_sequence_concept.hpp:288
Provides seqan3::align_result.
Provides seqan3::detail::alignment_coordinate.
Contains algorithms that operate on seqan3::detail::alignment_trace_matrix.
::ranges::end end
Alias for ranges::end. Returns an iterator to the end of a range.
Definition: ranges:190
constexpr underlying_rank_t< alphabet_type > to_rank(alphabet_type const alph) requires requires(alphabet_type alph)
Implementation of seqan3::semi_alphabet_concept::to_rank() that delegates to a member function...
Definition: member_exposure.hpp:97
constexpr detail::align_config_score_adaptor score
A configuration adaptor for alignment scoring.
Definition: align_config_score.hpp:117