SeqAn3
fm_index_iterator.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <array>
43 #include <type_traits>
44 
45 #include <sdsl/suffix_trees.hpp>
46 
47 #include <range/v3/view/iota.hpp>
48 #include <range/v3/view/slice.hpp>
49 
50 #include <seqan3/alphabet/all.hpp>
56 
57 // namespace seqan3::detail
58 // {
59 // // forward declaration
60 // auto get_suffix_array_range(fm_index_iterator<index_t> const & it);
61 // } // namespace seqan3::detail
62 
63 namespace seqan3
64 {
65 
90 template <typename index_t>
92 {
93 
94 public:
95 
//!\brief Type of the index this iterator operates on (the class template parameter).
99  using index_type = index_t;
//!\brief Type for representing positions in the indexed text; taken from the index type.
102  using size_type = typename index_type::size_type;
104 
105 protected:
107 
111  using node_type = detail::fm_index_iterator_node<index_t>;
114  using sdsl_char_type = typename index_type::sdsl_char_type;
116 
118  index_type const * index;
120  size_type parent_lb;
122  size_type parent_rb;
124  node_type node;
125 
126  template <typename _index_t>
127  friend class bi_fm_index_iterator;
128 
129  // friend detail::get_suffix_array_range;
130 
132  size_type offset() const noexcept
133  {
134  assert(index->index.size() > query_length());
135  return index->index.size() - query_length() - 1; // since the string is reversed during construction
136  }
137 
//!\brief Performs a single backward-search step with character `c` on the suffix array interval [l, r] of `csa`.
//!\returns `true` if the narrowed interval is non-empty; in that case `l` and `r` are overwritten with the new
//!         bounds. Returns `false` otherwise and leaves `l` and `r` untouched.
139  template <detail::sdsl_index_concept csa_t>
140  bool backward_search(csa_t const & csa, sdsl_char_type const c, size_type & l, size_type & r) const noexcept
141  {
142  assert(l <= r && r < csa.size());
143 
144  size_type _l, _r;
145 
146  size_type cc = c;
// NOTE(review): the listing jumps from line 146 to line 148, so the statement guarding the following block is
// not visible here - check the original header before touching this part.
148  {
149  cc = csa.char2comp[c];
// a non-zero character that maps to comp value 0 - presumably c does not occur in the indexed text
150  if (cc == 0 && c > 0) // [[unlikely]]
151  return false;
152  }
153 
154  size_type const c_begin = csa.C[cc];
// the interval spans the whole suffix array: the bounds can be read from C directly, no rank queries needed
155  if (l == 0 && r + 1 == csa.size()) // [[unlikely]]
156  {
157  _l = c_begin;
158  _r = csa.C[cc + 1] - 1;
159  // if the plain_byte_alphabet is not used, we could always return true here
160  }
161  else
162  {
163  _l = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
164  _r = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
165  }
166 
// only commit the new bounds if the resulting interval is non-empty
167  if (_r >= _l)
168  {
169  r = _r;
170  l = _l;
// NOTE(review): if size_type is unsigned this assertion can never fail - confirm whether it is still wanted
171  assert(r + 1 - l >= 0);
172  return true;
173  }
174  return false;
175  }
176 
177 public:
178 
182  // Default construction is necessary to make this class semi-regular and e.g., to allow construction of
184  // std::array of iterators.
// All five special member functions below are defaulted and declared noexcept.
185  fm_index_iterator() noexcept = default;
186  fm_index_iterator(fm_index_iterator const &) noexcept = default;
187  fm_index_iterator & operator=(fm_index_iterator const &) noexcept = default;
188  fm_index_iterator(fm_index_iterator &&) noexcept = default;
189  fm_index_iterator & operator=(fm_index_iterator &&) noexcept = default;
190 
// Constructs an iterator for the given index: only the address of _index is stored (the index must outlive
// the iterator) and the node is set to the interval [0, _index.index.size() - 1] with the two remaining fields
// (depth and last character, judging by the other assignments to node) set to 0.
// This initializer list does not mention parent_lb and parent_rb.
191  fm_index_iterator(index_t const & _index) noexcept : index(&_index), node({0, _index.index.size() - 1, 0, 0})
192  {}
193  //\}
194 
207  bool operator==(fm_index_iterator const & rhs) const noexcept
208  {
209  assert(index != nullptr);
210  assert(node != rhs.node || (query_length() == 0 || (parent_lb == rhs.parent_lb && parent_rb == rhs.parent_rb)));
211 
212  // position in the implicit suffix tree is defined by the SA interval and depth.
213  // No need to compare parent intervals
214  return node == rhs.node;
215  }
216 
229  bool operator!=(fm_index_iterator const & rhs) const noexcept
230  {
231  assert(index != nullptr);
232 
233  return !(*this == rhs);
234  }
235 
253  bool extend_right() noexcept
254  {
255  // TODO: specialize extend_right() and cycle_back() for EPR-dictionaries
256  // store all iterators at once in a private std::array of iterators
257  assert(index != nullptr);
258 
259  sdsl_char_type c = 1; // NOTE: start with 0 or 1 depending on implicit_sentintel
260  size_type _lb = node.lb, _rb = node.rb;
261  while (c < index->index.sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
262  {
263  ++c;
264  }
265 
266  if (c != index->index.sigma)
267  {
268  parent_lb = node.lb;
269  parent_rb = node.rb;
270  node = {_lb, _rb, node.depth + 1, c};
271  return true;
272  }
273  return false;
274  }
275 
290  template <alphabet_concept char_t>
294  bool extend_right(char_t const c) noexcept
295  {
296  assert(index != nullptr);
297 
298  size_type _lb = node.lb, _rb = node.rb;
299 
300  sdsl_char_type c_char = to_rank(c) + 1;
301 
302  if (backward_search(index->index, c_char, _lb, _rb))
303  {
304  parent_lb = node.lb;
305  parent_rb = node.rb;
306  node = {_lb, _rb, node.depth + 1, c_char};
307  return true;
308  }
309  return false;
310  }
311 
328  template <std::ranges::RandomAccessRange seq_t>
330  requires implicitly_convertible_to_concept<innermost_value_type_t<seq_t>, typename index_t::char_type>
332  bool extend_right(seq_t && seq) noexcept
333  {
334  auto first = seq.begin();
335  auto last = seq.end();
336 
337  assert(index != nullptr); // range must not be empty!
338 
339  size_type _lb = node.lb, _rb = node.rb;
340  size_type new_parent_lb = parent_lb, new_parent_rb = parent_rb;
341 
342  sdsl_char_type c{};
343 
344  for (auto it = first; it != last; ++it)
345  {
346  c = to_rank(*it) + 1;
347 
348  new_parent_lb = _lb;
349  new_parent_rb = _rb;
350  if (!backward_search(index->index, c, _lb, _rb))
351  return false;
352  }
353 
354  parent_lb = new_parent_lb;
355  parent_rb = new_parent_rb;
356  node = {_lb, _rb, last - first + node.depth, c};
357  return true;
358  }
359 
386  bool cycle_back() noexcept
387  {
388  assert(index != nullptr && query_length() > 0);
389  // parent_lb > parent_rb --> invalid interval
390  assert(parent_lb <= parent_rb);
391 
392  sdsl_char_type c = node.last_char + 1;
393  size_type _lb = parent_lb, _rb = parent_rb;
394 
395  while (c < index->index.sigma && !backward_search(index->index, index->index.comp2char[c], _lb, _rb))
396  {
397  ++c;
398  }
399 
400  if (c != index->index.sigma)
401  {
402  node = {_lb, _rb, node.depth, c};
403  return true;
404  }
405  return false;
406  }
407 
423  typename index_t::char_type last_char() noexcept
424  {
425  // parent_lb > parent_rb --> invalid interval
426  assert(index != nullptr && query_length() > 0 && parent_lb <= parent_rb);
427 
428  typename index_t::char_type c;
429  assign_rank(c, index->index.comp2char[node.last_char] - 1); // text is not allowed to contain ranks of 0
430  return c;
431  }
432 
447  size_type query_length() const noexcept
448  {
449  assert(index != nullptr);
450  assert(node.depth != 0 || (node.lb == 0 && node.rb == index->size() - 1)); // depth == 0 -> root node
451 
452  return node.depth;
453  }
454 
469  auto query() const noexcept
470  {
471  assert(index != nullptr && index->text != nullptr);
472 
473  size_type const query_begin = offset() - index->index[node.lb];
474  return *index->text | ranges::view::slice(query_begin, query_begin + query_length());
475  }
476 
478  auto operator*() const noexcept
479  {
480  assert(index != nullptr && index->text != nullptr);
481 
482  return query();
483  }
484 
496  size_type count() const noexcept
497  {
498  assert(index != nullptr);
499 
500  return 1 + node.rb - node.lb;
501  }
502 
514  std::vector<size_type> locate() const
515  {
516  assert(index != nullptr);
517 
518  std::vector<size_type> occ(count());
519  for (size_type i = 0; i < occ.size(); ++i)
520  {
521  occ[i] = offset() - index->index[node.lb + i];
522  }
523  return occ;
524  }
525 
538  auto lazy_locate() const
539  {
540  assert(index != nullptr);
541 
542  return ranges::view::iota(node.lb, node.lb + count())
543  | view::transform([*this, _offset = offset()] (auto sa_pos) { return _offset - index->index[sa_pos]; });
544  }
545 
546 };
547 
549 
550 } // namespace seqan3
constexpr simd_t iota(typename simd_traits< simd_t >::scalar_type const offset)
Fills a seqan3::simd::simd_type vector with the scalar values offset, offset+1, offset+2, ...
Definition: simd_algorithm.hpp:100
Provides the internal representation of a node of the seqan3::fm_index_iterator.
constexpr auto transform
A range adaptor that takes an invocable and returns a view of the elements with the invocable applied...
Definition: transform.hpp:95
bool cycle_back() noexcept
Tries to replace the rightmost character of the query by the next lexicographically larger character ...
Definition: fm_index_iterator.hpp:386
bool extend_right(seq_t &&seq) noexcept
Tries to extend the query by seq to the right.
Definition: fm_index_iterator.hpp:332
auto lazy_locate() const
Locates the occurrences of the searched query in the text on demand, i.e. a ranges::view is returned ...
Definition: fm_index_iterator.hpp:538
bool extend_right(char_t const c) noexcept
Tries to extend the query by the character c to the right.
Definition: fm_index_iterator.hpp:294
Provides an alphabet mapping that implements an identity map (i.e. each character is mapped to its ra...
size_type query_length() const noexcept
Returns the length of the searched query.
Definition: fm_index_iterator.hpp:447
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:58
The SeqAn FM Index Iterator.
Definition: fm_index_iterator.hpp:91
bool extend_right() noexcept
Tries to extend the query by the smallest possible character to the right such that the query is foun...
Definition: fm_index_iterator.hpp:253
auto query() const noexcept
Returns the searched query.
Definition: fm_index_iterator.hpp:469
auto operator*() const noexcept
Returns the searched query.
Definition: fm_index_iterator.hpp:478
index_t index_type
Type of the index.
Definition: fm_index_iterator.hpp:100
constexpr alphabet_type & assign_rank(alphabet_type &alph, underlying_rank_t< alphabet_type > const rank) requires requires(alphabet_type alph)
Implementation of seqan3::semi_alphabet_concept::assign_rank() that delegates to a member function...
Definition: member_exposure.hpp:110
index_t::char_type last_char() noexcept
Outputs the rightmost character.
Definition: fm_index_iterator.hpp:423
Meta-header for the alphabet module.
bool operator==(fm_index_iterator const &rhs) const noexcept
Compares two iterators.
Definition: fm_index_iterator.hpp:207
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
typename index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index_iterator.hpp:102
Provides seqan3::view::transform.
Provides C++20 additions to the type_traits header.
The SeqAn Bidirectional FM Index Iterator.
Definition: bi_fm_index_iterator.hpp:83
std::vector< size_type > locate() const
Locates the occurrences of the searched query in the text.
Definition: fm_index_iterator.hpp:514
Resolves to std::ranges::ImplicitlyConvertibleTo<type1, type2>().
bool operator!=(fm_index_iterator const &rhs) const noexcept
Compares two iterators.
Definition: fm_index_iterator.hpp:229
Provides various metafunctions used by the range module.
Provides the unidirectional seqan3::fm_index.
constexpr underlying_rank_t< alphabet_type > to_rank(alphabet_type const alph) requires requires(alphabet_type alph)
Implementation of seqan3::semi_alphabet_concept::to_rank() that delegates to a member function...
Definition: member_exposure.hpp:97
size_type count() const noexcept
Counts the number of occurrences of the searched query in the text.
Definition: fm_index_iterator.hpp:496
fm_index_iterator() noexcept=default
Default constructor. Accessing member functions on a default constructed object is undefined behavior...