SeqAn3
search.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
46 
47 namespace seqan3::detail
48 {
49 
71 template <typename index_t, typename query_t, typename configuration_t>
72 inline auto search_single(index_t const & index, query_t & query, configuration_t const & cfg)
73 {
74  // retrieve error numbers / rates
75  detail::search_param max_error{0, 0, 0, 0};
76  auto & [total, subs, ins, del] = max_error;
77  if constexpr (contains<search_cfg::id::max_error>(cfg))
78  {
79  std::tie(total, subs, ins, del) = get<search_cfg::id::max_error>(cfg);
80  }
81  else if constexpr (contains<search_cfg::id::max_error_rate>(cfg))
82  {
83  // NOTE: Casting doubles rounds towards zero (i.e. floor for positive numbers). Thus, given a rate of
84  // 10% and a read length of 101 the max number of errors is correctly casted from 10.1 errors to 10
85  std::tie(total, subs, ins, del) = std::apply([& query] (auto && ... args)
86  {
87  return std::tuple{(args * query.size())...};
88  }, get<search_cfg::id::max_error_rate>(cfg));
89  }
90 
91  // TODO: if total not set: max_error.total = max_error.deletion + max_error.substitution + max_error.insertion;
92  // TODO: throw exception when any error number or rate is higher than the total error number/rate
93  // throw std::invalid_argument("The total number of errors is set to zero while there is a positive number"
94  // " of errors for a specific error type.");
95 
96  // construct internal delegate for collecting hits for later filtering (if necessary)
97  std::vector<typename index_t::iterator_type> internal_hits;
98  auto internal_delegate = [&internal_hits, &max_error] (auto const & it)
99  {
100  internal_hits.push_back(it);
101  };
102 
103  // choose mode
104  auto const & selected_mode = seqan3::get<search_cfg::id::mode>(cfg);
105  if constexpr (std::Same<remove_cvref_t<decltype(selected_mode)>, detail::search_mode_best>)
106  {
107  detail::search_param max_error2{max_error};
108  max_error2.total = 0;
109  while (internal_hits.empty() && max_error2.total <= max_error.total)
110  {
111  detail::search_algo<true>(index, query, max_error2, internal_delegate);
112  max_error2.total++;
113  }
114  }
115  else if constexpr (std::Same<remove_cvref_t<decltype(selected_mode)>, detail::search_mode_all_best>)
116  {
117  detail::search_param max_error2{max_error};
118  max_error2.total = 0;
119  while (internal_hits.empty() && max_error2.total <= max_error.total)
120  {
121  detail::search_algo<false>(index, query, max_error2, internal_delegate);
122  max_error2.total++;
123  }
124  }
125  else if constexpr (std::Same<remove_cvref_t<decltype(selected_mode)>, search_cfg::strata>)
126  {
127  detail::search_param max_error2{max_error};
128  max_error2.total = 0;
129  while (internal_hits.empty() && max_error2.total <= max_error.total)
130  {
131  detail::search_algo<true>(index, query, max_error2, internal_delegate);
132  max_error2.total++;
133  }
134  if (!internal_hits.empty())
135  {
136  internal_hits.clear(); // TODO: don't clear when using Optimum Search Schemes with lower error bounds
137  uint8_t const s = selected_mode;
138  max_error2.total += s - 1;
139  detail::search_algo<false>(index, query, max_error2, internal_delegate);
140  }
141  }
142  else // detail::search_mode_all
143  {
144  detail::search_algo<false>(index, query, max_error, internal_delegate);
145  }
146 
147  // TODO: filter hits and only do it when necessary (depending on error types)
148 
149  // output iterators or text_positions
150  auto const & output = seqan3::get<search_cfg::id::output>(cfg);
151  if constexpr (std::Same<remove_cvref_t<decltype(output)>, detail::search_output_index_iterator>)
152  {
153  return internal_hits;
154  }
155  else
156  {
157  std::vector<typename index_t::size_type> hits;
158  auto const & selected_mode = seqan3::get<search_cfg::id::mode>(cfg);
159  if constexpr (std::Same<remove_cvref_t<decltype(selected_mode)>, detail::search_mode_best>)
160  {
161  // only one iterator is reported but it might contain more than one text position
162  if (!internal_hits.empty())
163  {
164  auto text_pos = internal_hits[0].lazy_locate();
165  hits.push_back(text_pos[0]);
166  }
167  }
168  else
169  {
170  for (auto const & it : internal_hits)
171  {
172  for (auto const & text_pos : it.locate())
173  hits.push_back(text_pos);
174  std::sort(hits.begin(), hits.end());
175  hits.erase(std::unique(hits.begin(), hits.end()), hits.end());
176  }
177  }
178  return hits;
179  }
180 }
181 
200 template <typename index_t, typename queries_t, typename configuration_t>
201 inline auto search_all(index_t const & index, queries_t & queries, configuration_t const & cfg)
202 {
203  // return type: for each query: a vector of text_positions (or iterators)
204  // delegate params: text_position (or iterator). we will withhold all hits of one query anyway to filter
205  // duplicates. more efficient to call delegate once with one vector instead of calling
206  // delegate for each hit separately at once.
207  auto const & output = seqan3::get<search_cfg::id::output>(cfg);
208  using hit_t = std::conditional_t<std::Same<remove_cvref_t<decltype(output)>, detail::search_output_index_iterator>,
209  typename index_t::iterator_type,
210  typename index_t::size_type>;
211 
212  if constexpr (std::ranges::ForwardRange<queries_t> && std::ranges::RandomAccessRange<value_type_t<queries_t>>)
213  {
214  // TODO: if constexpr (contains<search_cfg::id::on_hit>(cfg))
215  std::vector<std::vector<hit_t>> hits;
216  hits.reserve(std::distance(queries.begin(), queries.end()));
217  for (auto const query : queries)
218  {
219  hits.push_back(search_single(index, query, cfg));
220  }
221  return hits;
222  }
223  else // std::ranges::RandomAccessRange<queries_t>
224  {
225  // TODO: if constexpr (contains<search_cfg::id::on_hit>(cfg))
226  return search_single(index, queries, cfg);
227  }
228 }
229 
231 
232 } // namespace seqan3::detail
constexpr detail::align_config_output_adaptor< e > output
A configuration adaptor for alignment output.
Definition: align_config_output.hpp:114
Provides the algorithm to search in an index using search schemes.
Provides the concepts for seqan3::fm_index and seqan3::bi_fm_index and its traits and iterators...
constexpr detail::align_config_max_error_adaptor max_error
A configuration adaptor for maximal errors.
Definition: align_config_max_error.hpp:110
Provides various metafunctions base templates and shortcuts.
std::remove_cv_t< std::remove_reference_t< t > > remove_cvref_t
Return the input type with const, volatile and references removed [Type metafunction].
Definition: basic.hpp:64
Specifies requirements of a Range type for which begin returns a type that models std::RandomAccessIt...
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
Provides an approximate string matching algorithm based on simple backtracking. This should only be u...
Specifies requirements of a Range type for which begin returns a type that models std::ForwardIterato...
Definition: aligned_sequence_concept.hpp:288