SeqAn3
fm_index.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <sdsl/suffix_trees.hpp>
43 
44 #include <range/v3/algorithm/copy.hpp>
45 
47 #include <seqan3/io/filesystem.hpp>
56 #include <seqan3/std/ranges>
57 
58 namespace seqan3
59 {
60 
62 // forward declarations
// Iterator over the unidirectional FM index; declared up front so that seqan3::fm_index can name it as a friend.
63 template <typename index_t>
64 class fm_index_iterator;
65 
// Iterator over the bidirectional FM index; declared up front so that seqan3::fm_index can name it as a friend.
66 template <typename index_t>
67 class bi_fm_index_iterator;
69 
88 {
90  using sdsl_index_type = sdsl::csa_wt<
91  sdsl::wt_blcd<
92  sdsl::bit_vector,
93  sdsl::rank_support_v<>,
94  sdsl::select_support_scan<>,
95  sdsl::select_support_scan<0>
96  >,
97  16,
98  10000000,
99  sdsl::sa_order_sa_sampling<>,
100  sdsl::isa_sampling<>,
102  >;
103 };
104 
129 template <std::ranges::RandomAccessRange text_t, fm_index_traits_concept fm_index_traits = fm_index_default_traits>
134 class fm_index
135 {
136 protected:
138 
// Type of the underlying SDSL index, taken from the traits class.
142  using sdsl_index_type = typename fm_index_traits::sdsl_index_type;
// Character type of the alphabet used by the underlying SDSL index.
147  using sdsl_char_type = typename sdsl_index_type::alphabet_type::char_type;
149 
// The underlying SDSL index.
151  sdsl_index_type index;
// Non-owning pointer to the indexed text; stays nullptr until construct() assigns it.
153  text_t const * text = nullptr;
154 
155 public:
// The type of the indexed text.
159  using text_type = text_t;
// Type for representing positions in the indexed text.
164  using size_type = typename sdsl_index_type::size_type;
168 
// The iterator and iterator-node templates are friends so that they can reach the protected
// members of this class (the underlying SDSL index and the text pointer).
169  template <typename bi_fm_index_t>
170  friend class bi_fm_index_iterator;
171 
172  template <typename fm_index_t>
173  friend class fm_index_iterator;
174 
// Internal representation of a node of the seqan3::fm_index_iterator.
175  template <typename fm_index_t>
176  friend class detail::fm_index_iterator_node;
177 
// Default construction, copy, move and destruction are all compiler-generated.
// A default-constructed index has text == nullptr (see the member initialiser).
181  fm_index() = default;
182  fm_index(fm_index const &) = default;
183  fm_index & operator=(fm_index const &) = default;
184  fm_index(fm_index &&) = default;
185  fm_index & operator=(fm_index &&) = default;
186  ~fm_index() = default;
187 
197  fm_index(text_t const & text)
198  {
199  construct(text);
200  }
201 
// Deleted: the index must not be constructed from a temporary text, because construct() only stores a pointer to it.
203  fm_index(text_t &&) = delete;
204 
// Deleted for const rvalues as well, for the same reason.
206  fm_index(text_t const &&) = delete;
208 
225  void construct(text_t const & text)
226  {
227  // text must not be empty
228  if (text.begin() == text.end())
229  throw std::invalid_argument("The text that is indexed cannot be empty.");
230  this->text = &text;
231  // TODO:
232  // * check what happens in sdsl when constructed twice!
233  // * choose between in-memory/external and construction algorithms
234  // * sdsl construction currently only works for int_vector, std::string and char *, not ranges in general
235  // uint8_t largest_char = 0;
236  sdsl::int_vector<8> tmp_text(text.size());
237 
238  //TODO view::reverse is broken and can't be chained right now
239  std::vector<value_type_t<text_t>> another_copy = text | view::reverse;
240 
241  std::ranges::copy(another_copy | view::to_rank | view::transform([] (uint8_t const r) { return r + 1; }),
242  seqan3::begin(tmp_text)); // reverse and increase rank by one
243 
244  sdsl::construct_im(index, tmp_text, 0);
245 
246  // TODO: would be nice but doesn't work since it's private and the public member references are const
247  // index.m_C.resize(largest_char);
248  // index.m_C.shrink_to_fit();
249  // index.m_sigma = largest_char;
250  }
251 
// Deleted: construct() stores a pointer to the text, so a temporary text is not accepted.
253  void construct(text_t &&) = delete;
254 
// Deleted for const rvalues as well, for the same reason.
256  void construct(text_t const &&) = delete;
257 
269  size_type size() const noexcept
270  {
271  return index.size();
272  }
273 
285  bool empty() const noexcept
286  {
287  return size() == 0;
288  }
289 
290  // operator== not implemented by sdsl indices yet
291  // bool operator==(fm_index const & rhs) const noexcept
292  // {
293  // return index == rhs.index;
294  // }
295 
296  // operator== not implemented by sdsl indices yet
297  // bool operator!=(fm_index const & rhs) const noexcept
298  // {
299  // return !(*this == rhs);
300  // }
301 
316  iterator_type begin() const noexcept
317  {
318  return {*this};
319  }
320 
333  bool load(filesystem::path const & path)
334  {
335  sdsl_index_type tmp;
336  if (sdsl::load_from_file(tmp, path))
337  {
338  std::swap(this->index, tmp);
339  return true;
340  }
341  return false;
342  }
343 
356  bool store(filesystem::path const & path) const
357  {
358  return sdsl::store_to_file(index, path);
359  }
360 
361 };
362 
364 
365 } // namespace seqan3
Provides seqan3::view::reverse.
Provides the internal representation of a node of the seqan3::fm_index_iterator.
constexpr auto transform
A range adaptor that takes an invocable and returns a view of the elements with the invocable applied...
Definition: transform.hpp:95
innermost_value_type_t< text_t > char_type
The type of the underlying character of text_type.
Definition: fm_index.hpp:162
typename innermost_value_type< t >::type innermost_value_type_t
Shortcut for seqan3::innermost_value_type.
Definition: range.hpp:213
bool load(filesystem::path const &path)
Loads the index from disk. Temporary function until cereal is supported.
Definition: fm_index.hpp:333
typename sdsl_index_type::size_type size_type
Type for representing positions in the indexed text.
Definition: fm_index.hpp:164
iterator_type begin() const noexcept
Returns a seqan3::fm_index_iterator on the index that can be used for searching.
Definition: fm_index.hpp:316
Provides the concepts for seqan3::fm_index and seqan3::bi_fm_index and its traits and iterators...
void construct(text_t const &text)
Constructs the index given a range. The range cannot be an rvalue (i.e. a temporary object) and has t...
Definition: fm_index.hpp:225
Contains various shortcuts for common std::ranges functions.
Provides an alphabet mapping that implements an identity map (i.e. each character is mapped to its ra...
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: ranges:200
::ranges::size size
Alias for ranges::size. Obtains the size of a range whose size can be calculated in constant time...
Definition: ranges:195
The generic alphabet concept that covers most data types used in ranges. This is the core alphabet con...
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:58
bool store(filesystem::path const &path) const
Stores the index to disk. Temporary function until cereal is supported.
Definition: fm_index.hpp:356
The SeqAn FM Index Iterator.
Definition: fm_index_iterator.hpp:91
fm_index(text_t const &text)
Constructor that immediately constructs the index given a range. The range cannot be an rvalue (i...
Definition: fm_index.hpp:197
Provides seqan3::view::to_rank.
The default FM Index Configuration.
Definition: fm_index.hpp:87
Adaptations of concepts from the Ranges TS.
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
size_type size() const noexcept
Returns the length of the indexed text including sentinel characters.
Definition: fm_index.hpp:269
Provides the seqan3::fm_index_iterator for searching in the unidirectional seqan3::fm_index.
sdsl::csa_wt< sdsl::wt_blcd< sdsl::bit_vector, sdsl::rank_support_v<>, sdsl::select_support_scan<>, sdsl::select_support_scan< 0 > >, 16, 10000000, sdsl::sa_order_sa_sampling<>, sdsl::isa_sampling<>, sdsl::plain_byte_alphabet > sdsl_index_type
Type of the underlying SDSL index.
Definition: fm_index.hpp:102
This header includes C++17 filesystem support and imports it into namespace seqan3::filesystem (indep...
Byte alphabet that does no mapping of char_type to comp_char_type and vice versa. ...
Definition: csa_alphabet_strategy.hpp:62
Provides seqan3::view::transform.
bool empty() const noexcept
Checks whether the index is empty.
Definition: fm_index.hpp:285
The SeqAn Bidirectional FM Index Iterator.
Definition: bi_fm_index_iterator.hpp:83
Provides various metafunctions used by the range module.
constexpr auto reverse
A range adaptor that presents the underlying range in reverse order.
Definition: reverse.hpp:93
text_t text_type
The type of the indexed text.
Definition: fm_index.hpp:160
auto const to_rank
A view that calls seqan3::to_rank() on each element in the input range.
Definition: to_rank.hpp:90
The SeqAn FM Index.
Definition: fm_index.hpp:134