SeqAn3
search_scheme_algorithm.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <type_traits>
43 
44 #include <range/v3/view/slice.hpp>
45 
51 
52 namespace seqan3::detail
53 {
54 
71 inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
72 {
73  // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
74  // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
75  // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
76  // first hit (e.g. search mode: best). Even though it is not guaranteed, this seems to be a good greedy
77  // approach.
78  std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
79  return scheme;
80 }
81 
98 template <typename search_scheme_t>
99 inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
100 {
101  using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
102 
103  bool constexpr is_dyn_scheme = std::Same<search_scheme_t, search_scheme_dyn_type>;
104 
105  // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
106  using result_type = std::conditional_t<is_dyn_scheme,
107  std::vector<std::tuple<blocks_length_type, size_t>>,
108  std::array<std::tuple<blocks_length_type, size_t>,
110  std::false_type>::value>>;
111 
112  result_type result;
113  if constexpr (is_dyn_scheme)
114  result.resize(search_scheme.size());
115 
116  uint8_t const blocks {search_scheme[0].blocks()};
117  size_t const block_length{query_length / blocks};
118  uint8_t const rest {static_cast<uint8_t>(query_length % blocks)};
119 
120  blocks_length_type blocks_length;
121  // set all blocks_length values to block_length
122  // resp. block_length + 1 for the first `rest = block_length % blocks` values
123  if constexpr (is_dyn_scheme)
124  blocks_length.resize(blocks, block_length);
125  else
126  blocks_length.fill(block_length);
127 
128  for (uint8_t block_id = 0; block_id < rest; ++block_id)
129  ++blocks_length[block_id];
130 
131  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
132  {
133  auto const & search = search_scheme[search_id];
134 
135  auto & [search_blocks_length, start_pos] = result[search_id];
136 
137  // compute cumulative blocks_length and starting position
138  start_pos = 0;
139  if constexpr (is_dyn_scheme)
140  search_blocks_length.resize(blocks);
141  search_blocks_length[0] = blocks_length[search.pi[0] - 1];
142  for (uint8_t i = 1; i < blocks; ++i)
143  {
144  search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
145  if (search.pi[i] < search.pi[0])
146  start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
147  }
148  }
149 
150  return result;
151 }
152 
154 // forward declaration
155 template <bool abort_on_hit, typename iterator_t, typename query_t, typename search_t, typename blocks_length_t,
156  typename delegate_t>
157 inline bool search_ss(iterator_t it, query_t & query,
158  typename iterator_t::size_type const lb, typename iterator_t::size_type const rb,
159  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
160  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate);
162 
193 template <bool abort_on_hit, typename iterator_t, typename query_t, typename search_t, typename blocks_length_t,
194  typename delegate_t>
195 inline bool search_ss_exact(iterator_t it, query_t & query,
196  typename iterator_t::size_type const lb, typename iterator_t::size_type const rb,
197  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
198  search_t const & search, blocks_length_t const & blocks_length,
199  search_param const error_left, delegate_t && delegate)
200 {
201  using size_type = typename iterator_t::size_type;
202 
203  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
204  bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
205 
206  if (go_right)
207  {
208  size_type const infix_lb = rb - 1; // inclusive
209  size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
210 
211  if (!it.extend_right(query | ranges::view::slice(infix_lb, infix_rb + 1)))
212  return false;
213 
214  if (search_ss<abort_on_hit>(it, query, lb, infix_rb + 2, errors_spent, block_id2, go_right2, search,
215  blocks_length, error_left, delegate) && abort_on_hit)
216  {
217  return true;
218  }
219  }
220  else
221  {
222  size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
223  size_type const infix_rb = lb - 1; // inclusive
224 
225  if (!it.extend_left(query | ranges::view::slice(infix_lb, infix_rb + 1)))
226  return false;
227 
228  if (search_ss<abort_on_hit>(it, query, infix_lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
229  error_left, delegate) && abort_on_hit)
230  {
231  return true;
232  }
233  }
234  return false;
235 }
236 
242 template <bool abort_on_hit, typename iterator_t, typename query_t, typename search_t, typename blocks_length_t,
243  typename delegate_t>
244 inline bool search_ss_deletion(iterator_t it, query_t & query,
245  typename iterator_t::size_type const lb, typename iterator_t::size_type const rb,
246  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
247  search_t const & search, blocks_length_t const & blocks_length,
248  search_param const error_left, delegate_t && delegate)
249 {
250  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
251  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
252 
253  // Switch to the next block when the min number of errors is reached
254  if (min_error_left_in_block == 0)
255  {
256  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
257  bool const go_right2 = search.pi[block_id2] > search.pi[block_id2 - 1];
258 
259  if (search_ss<abort_on_hit>(it, query, lb, rb, errors_spent, block_id2, go_right2, search, blocks_length,
260  error_left, delegate) && abort_on_hit)
261  {
262  return true;
263  }
264  }
265 
266  // Insert deletions into the current block as long as possible
267  // Do not allow deletions at the beginning of the leftmost block
268  // Do not allow deletions at the end of the rightmost block
269  if (!(search.pi[block_id] == 1 && !go_right) &&
270  !(search.pi[block_id] == search.blocks() && go_right) &&
271  max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0 &&
272  ((go_right && it.extend_right()) || (!go_right && it.extend_left())))
273  {
274  search_param error_left2{error_left};
275  error_left2.total--;
276  error_left2.deletion--;
277  do
278  {
279  if (search_ss_deletion<abort_on_hit>(it, query, lb, rb, errors_spent + 1, block_id, go_right, search,
280  blocks_length, error_left2, delegate) && abort_on_hit)
281  {
282  return true;
283  }
284  } while ((go_right && it.cycle_back()) || (!go_right && it.cycle_front()));
285  }
286  return false;
287 }
288 
296 template <bool abort_on_hit, typename iterator_t, typename query_t, typename search_t, typename blocks_length_t,
297  typename delegate_t>
298 inline bool search_ss_children(iterator_t it, query_t & query,
299  typename iterator_t::size_type const lb, typename iterator_t::size_type const rb,
300  uint8_t const errors_spent, uint8_t const block_id, bool const go_right,
301  uint8_t const min_error_left_in_block, search_t const & search,
302  blocks_length_t const & blocks_length, search_param const error_left,
303  delegate_t && delegate)
304 {
305  using size_type = typename iterator_t::size_type;
306  if ((go_right && it.extend_right()) || (!go_right && it.extend_left()))
307  {
308  size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
309 
310  size_type lb2 = lb - !go_right;
311  size_type rb2 = rb + go_right;
312 
313  do
314  {
315  bool const delta = it.last_char() != query[(go_right ? rb : lb) - 1];
316 
317  // skip if there are more min errors left in the current block than characters in the block
318  // i.e. chars_left - 1 < min_error_left_in_block - delta
319  // TODO: move that outside the if / do-while struct
320  // TODO: incorporate error_left.deletion into formula
321  if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
322  continue;
323 
324  if (!delta || error_left.substitution > 0)
325  {
326  search_param error_left2{error_left};
327  error_left2.total -= delta;
328  error_left2.substitution -= delta;
329 
330  // At the end of the current block
331  if (rb - lb == blocks_length[block_id])
332  {
333  // Leave the possibility for one or multiple deletions at the end of a block.
334  // Thus do not change the direction (go_right) yet.
335  if (error_left.deletion > 0)
336  {
337  if (search_ss_deletion<abort_on_hit>(it, query, lb2, rb2, errors_spent + delta, block_id,
338  go_right, search, blocks_length, error_left2, delegate) &&
339  abort_on_hit)
340  {
341  return true;
342  }
343  }
344  else
345  {
346  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
347  bool const go_right2 = search.pi[block_id2] > search.pi[block_id2 - 1];
348 
349  if (search_ss<abort_on_hit>(it, query, lb2, rb2, errors_spent + delta, block_id2, go_right2,
350  search, blocks_length, error_left2, delegate) &&
351  abort_on_hit)
352  {
353  return true;
354  }
355  }
356  }
357  else
358  {
359  if (search_ss<abort_on_hit>(it, query, lb2, rb2, errors_spent + delta, block_id, go_right, search,
360  blocks_length, error_left2, delegate) && abort_on_hit)
361  {
362  return true;
363  }
364  }
365  }
366 
367  // Deletion
368  // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
369  // No deletion at the beginning of the leftmost block.
370  // No deletion at the end of the rightmost block.
371  if (error_left.deletion > 0 &&
372  !(go_right && (rb == 1 || rb == query.size() + 1)) &&
373  !(!go_right && (lb == 0 || lb == query.size())))
374  {
375  search_param error_left3{error_left};
376  error_left3.total--;
377  error_left3.deletion--;
378  search_ss<abort_on_hit>(it, query, lb, rb, errors_spent + 1, block_id, go_right, search, blocks_length,
379  error_left3, delegate);
380  }
381  } while ((go_right && it.cycle_back()) || (!go_right && it.cycle_front()));
382  }
383  return false;
384 }
385 
390 template <bool abort_on_hit, typename iterator_t, typename query_t, typename search_t,
391  typename blocks_length_t, typename delegate_t>
392 inline bool search_ss(iterator_t it, query_t & query,
393  typename iterator_t::size_type const lb, typename iterator_t::size_type const rb,
394  uint8_t const errors_spent, uint8_t const block_id, bool const go_right, search_t const & search,
395  blocks_length_t const & blocks_length, search_param const error_left, delegate_t && delegate)
396 {
397  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
398  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
399 
400  // Done.
401  if (min_error_left_in_block == 0 && lb == 0 && rb == query.size() + 1)
402  {
403  delegate(it);
404  return true;
405  }
406  // Exact search in current block.
407  else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id])) ||
408  (error_left.total == 0 && min_error_left_in_block == 0))
409  {
410  if (search_ss_exact<abort_on_hit>(it, query, lb, rb, errors_spent, block_id, go_right, search, blocks_length,
411  error_left, delegate) && abort_on_hit)
412  {
413  return true;
414  }
415  }
416  // Approximate search in current block.
417  // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
418  else if (error_left.total > 0)
419  {
420  // Insertion
421  if (error_left.insertion > 0)
422  {
423  using size_type = typename iterator_t::size_type;
424 
425  size_type const lb2 = lb - !go_right;
426  size_type const rb2 = rb + go_right;
427 
428  search_param error_left2{error_left};
429  error_left2.total--;
430  error_left2.insertion--;
431  // At the end of the current block
432  if (rb - lb == blocks_length[block_id])
433  {
434  // Leave the possibility for one or multiple deletions at the end of a block.
435  // Thus do not change the direction (go_right) yet.
436  // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
437  // it pay off the additional complexity and documentation for the user? (Note that the user might only
438  // allow for insertions and deletion and not for mismatches).
439  if (search_ss_deletion<abort_on_hit>(it, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
440  blocks_length, error_left2, delegate) && abort_on_hit)
441  {
442  return true;
443  }
444  }
445  else
446  {
447  if (search_ss<abort_on_hit>(it, query, lb2, rb2, errors_spent + 1, block_id, go_right, search,
448  blocks_length, error_left2, delegate) && abort_on_hit)
449  {
450  return true;
451  }
452  }
453  }
454  if (search_ss_children<abort_on_hit>(it, query, lb, rb, errors_spent, block_id, go_right,
455  min_error_left_in_block, search, blocks_length, error_left, delegate) &&
456  abort_on_hit)
457  {
458  return true;
459  }
460  }
461  return false;
462 }
463 
485 template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
486 inline void search_ss(index_t const & index, query_t & query, search_param const error_left,
487  search_scheme_t const & search_scheme, delegate_t && delegate)
488 {
489  // retrieve cumulative block lengths and starting position
490  auto const block_info = search_scheme_block_info(search_scheme, query.size());
491 
492  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
493  {
494  auto const & search = search_scheme[search_id];
495  auto const & [blocks_length, start_pos] = block_info[search_id];
496 
497  bool const hit = search_ss<abort_on_hit>(
498  index.begin(), // iterator on the index
499  query, // query to be searched
500  start_pos, start_pos + 1, // infix range already searched (open interval)
501  // the first character of `query` has the index 1 (not 0)
502  0, // errors spent
503  0, // current block id in search scheme
504  true, // search the first block from left to right
505  search, blocks_length, // search scheme information
506  error_left, // errors left (broken down by error types)
507  delegate // delegate function called on hit
508  );
509 
510  if (abort_on_hit && hit)
511  return;
512  }
513 }
514 
534 template <bool abort_on_hit, typename index_t, typename query_t, typename delegate_t>
535 inline void search_algo_bi(index_t const & index, query_t & query, search_param const error_left,
536  delegate_t && delegate)
537 {
538  switch (error_left.total)
539  {
540  case 0:
541  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 0>, delegate);
542  break;
543  case 1:
544  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 1>, delegate);
545  break;
546  case 2:
547  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 2>, delegate);
548  break;
549  case 3:
550  search_ss<abort_on_hit>(index, query, error_left, optimum_search_scheme<0, 3>, delegate);
551  break;
552  default:
553  auto const & search_scheme{compute_ss(0, error_left.total)};
554  search_ss<abort_on_hit>(index, query, error_left, search_scheme, delegate);
555  break;
556  }
557 }
558 
563 template <bool abort_on_hit, typename index_t, typename query_t, typename delegate_t>
564 inline void search_algo_uni(index_t const & index, query_t & query, search_param const error_left,
565  delegate_t && delegate)
566 {
567  search_trivial<abort_on_hit>(index, query, error_left, delegate);
568 }
569 
574 template <bool abort_on_hit, typename index_t, typename query_t, typename delegate_t>
575 inline void search_algo(index_t const & index, query_t & query, search_param const error_left, delegate_t && delegate)
576 {
577  if constexpr (bi_fm_index_concept<index_t>)
578  search_algo_bi<abort_on_hit>(index, query, error_left, delegate);
579  else
580  search_algo_uni<abort_on_hit>(index, query, error_left, delegate);
581 }
582 
584 
585 } // namespace seqan3::detail
Provides the data structures and precomputed instances for (optimum) search schemes.
Provides the concepts for seqan3::fm_index and seqan3::bi_fm_index and their traits and iterators...
Provides seqan3::detail::transformation_trait_or.
auto search(index_t const &index, queries_t &&queries, configuration_t const &cfg)
Search a query or a range of queries in an index.
Definition: search.hpp:82
::ranges::iterator_t iterator_t
Alias for ranges::iterator_t. Obtains the iterator type of a range.
Definition: ranges:225
typename transformation_trait_or< type_t, default_t >::type transformation_trait_or_t
Helper type of seqan3::detail::transformation_trait_or.
Definition: transformation_trait_or.hpp:79
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
Provides an approximate string matching algorithm based on simple backtracking. This should only be u...
Definition: aligned_sequence_concept.hpp:288
Provides C++20 additions to the type_traits header.
Provides data structures used by different search algorithms.