SeqAn3
csa_alphabet_strategy.hpp
Go to the documentation of this file.
1 // Copyright (c) 2018, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
4 
5 // ============================================================================
6 // SeqAn - The Library for Sequence Analysis
7 // ============================================================================
8 //
9 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
10 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
11 // All rights reserved.
12 //
13 // Redistribution and use in source and binary forms, with or without
14 // modification, are permitted provided that the following conditions are met:
15 //
16 // * Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 // * Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
22 // its contributors may be used to endorse or promote products derived
23 // from this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
29 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
31 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
35 // DAMAGE.
36 //
37 // ============================================================================
38 
47 #pragma once
48 
49 #include <string>
50 
51 #include <sdsl/config.hpp>
52 #include <sdsl/int_vector.hpp>
53 #include <sdsl/rank_support.hpp>
54 #include <sdsl/sdsl_concepts.hpp>
55 #include <sdsl/select_support.hpp>
56 
57 namespace sdsl
58 {
59 
61  // This is recommended when the underlying text uses the entire alphabet and not just a small subset.
63  {
65  public:
66  class mapping_wrapper;
67 
68  typedef int_vector<>::size_type size_type;
69  typedef mapping_wrapper char2comp_type;
70  typedef mapping_wrapper comp2char_type;
71  typedef int_vector<64> C_type;
72  typedef uint16_t sigma_type;
73  typedef uint8_t char_type;
74  typedef uint8_t comp_char_type;
75  typedef std::string string_type;
76  typedef byte_alphabet_tag alphabet_category;
77  enum { int_width = 8 };
78 
80  class mapping_wrapper
81  {
82  public:
83  mapping_wrapper() {}
84 
85  constexpr char_type operator[](char_type const c) const noexcept
86  {
87  return c;
88  }
89  };
90 
91  const char2comp_type char2comp;
92  const comp2char_type comp2char;
93  const C_type & C;
94  const sigma_type & sigma;
95 
96  private:
97  C_type m_C; // Cumulative counts for the compact alphabet [0..sigma].
98  sigma_type m_sigma; // Effective size of the alphabet.
99 
100  public:
102  plain_byte_alphabet() : C(m_C), sigma(m_sigma), m_sigma(0)
103  {}
104 
109  plain_byte_alphabet(int_vector_buffer<8> & text_buf, int_vector_size_type len) : C(m_C), sigma(m_sigma)
110  {
111  m_sigma = 0;
112  if (0 == len || 0 == text_buf.size())
113  return;
114 
115  assert(len <= text_buf.size());
116 
117  // initialize vectors
118  m_C = int_vector<64>(257, 0);
119  // count occurrences of each symbol
120  for (size_type i = 0; i < len; ++i)
121  ++m_C[text_buf[i]];
122 
123  assert(1 == m_C[0]); // null-byte should occur exactly once
124 
125  m_sigma = 255;
126  for (int i = 0; i < 256; ++i)
127  {
128  if (m_C[i])
129  {
130  m_sigma = i + 1;
131  // m_C[m_sigma] = m_C[i];
132  // ++m_sigma;
133  }
134  }
135  // m_C.resize(m_sigma + 1);
136  for (int i = (int) 256; i > 0; --i)
137  m_C[i] = m_C[i - 1];
138  m_C[0] = 0;
139  for (int i = 1; i <= (int) 256; ++i)
140  m_C[i] += m_C[i - 1];
141 
142  assert(C[sigma] == len);
143  }
144 
145  plain_byte_alphabet(plain_byte_alphabet const & strat) : C(m_C),
146  sigma(m_sigma),
147  m_C(strat.m_C),
148  m_sigma(strat.m_sigma)
149  {}
150 
151  plain_byte_alphabet(plain_byte_alphabet && strat) : C(m_C),
152  sigma(m_sigma),
153  m_C(std::move(strat.m_C)),
154  m_sigma(strat.m_sigma)
155  {}
156 
157  plain_byte_alphabet & operator=(plain_byte_alphabet const & strat)
158  {
159  if (this != &strat)
160  {
161  plain_byte_alphabet tmp(strat);
162  *this = std::move(tmp);
163  }
164  return *this;
165  }
166 
167  plain_byte_alphabet & operator=(plain_byte_alphabet && strat)
168  {
169  if (this != &strat)
170  {
171  m_C = std::move(strat.m_C);
172  m_sigma = std::move(strat.m_sigma);
173  }
174  return *this;
175  }
176 
177  size_type serialize(std::ostream & out, structure_tree_node * v, std::string name = "") const
178  {
179  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
180  size_type written_bytes = 0;
181  written_bytes += m_C.serialize(out, child, "m_C");
182  written_bytes += write_member(m_sigma, out, child, "m_sigma");
183  structure_tree::add_size(child, written_bytes);
184  return written_bytes;
185  }
186 
187  void load(std::istream & in)
188  {
189  m_C.load(in);
190  read_member(m_sigma, in);
191  }
193  };
194 
195 }
Byte alphabet that does no mapping of char_type to comp_char_type and vice versa. ...
Definition: csa_alphabet_strategy.hpp:62
Definition: csa_alphabet_strategy.hpp:57