SeqAn3
format_fasta.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <iterator>
43 #include <string>
44 #include <string_view>
45 #include <vector>
46 
47 #include <range/v3/algorithm/copy.hpp>
48 #include <range/v3/utility/iterator.hpp>
49 #include <range/v3/view/chunk.hpp>
50 #include <range/v3/view/drop_while.hpp>
51 #include <range/v3/view/join.hpp>
52 #include <range/v3/view/remove_if.hpp>
53 #include <range/v3/view/take_while.hpp>
54 
71 #include <seqan3/std/ranges>
74 
75 namespace seqan3
76 {
110 {
111 public:
116  sequence_file_format_fasta() = default;
118  sequence_file_format_fasta & operator=(sequence_file_format_fasta const &) = delete;
120  sequence_file_format_fasta & operator=(sequence_file_format_fasta &&) = default;
121  ~sequence_file_format_fasta() = default;
123 
//!\brief The valid file extensions for this format; note that you can modify this value.
static inline std::vector<std::string> file_extensions{"fasta", "fa", "fna", "ffn", "faa", "frn"};
134 
136  template <typename stream_type, // constraints checked by file
137  typename seq_legal_alph_type, bool seq_qual_combined,
138  typename seq_type, // other constraints checked inside function
139  typename id_type,
140  typename qual_type>
141  void read(stream_type & stream,
143  seq_type & sequence,
144  id_type & id,
145  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
146  {
147  auto stream_view = view::subrange<decltype(std::istreambuf_iterator<char>{stream}),
148  decltype(std::istreambuf_iterator<char>{})>
149  {std::istreambuf_iterator<char>{stream},
150  std::istreambuf_iterator<char>{}};
151  // ID
152  read_id(stream_view, options, id);
153 
154  // Sequence
155  read_seq(stream_view, options, sequence);
156 
157  // make sure "buffer at end" implies "stream at end"
158  if ((std::istreambuf_iterator<char>{stream} == std::istreambuf_iterator<char>{}) &&
159  (!stream.eof()))
160  {
161  stream.get(); // triggers error in stream and sets eof
162  }
163  }
164 
166  template <typename stream_type, // constraints checked by file
167  typename seq_type, // other constraints checked inside function
168  typename id_type,
169  typename qual_type>
170  void write(stream_type & stream,
171  sequence_file_output_options const & options,
172  seq_type && sequence,
173  id_type && id,
174  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
175  {
176 
177  std::ranges::ostreambuf_iterator stream_it{stream};
178 
179  // ID
180  if constexpr (detail::decays_to_ignore_v<id_type>)
181  {
182  throw std::logic_error{"The ID field may not be set to ignore when writing FASTA files."};
183  }
184  else
185  {
186  if (empty(id)) //[[unlikely]]
187  throw std::runtime_error{"The ID field may not be empty when writing FASTA files."};
188 
189  write_id(stream_it, options, id);
190  }
191 
192  // Sequence
193  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
194  {
195  throw std::logic_error{"The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTA files."};
196  }
197  else
198  {
199  if (empty(sequence)) //[[unlikely]]
200  throw std::runtime_error{"The SEQ field may not be empty when writing FASTA files."};
201 
202  write_seq(stream_it, options, sequence);
203  }
204  }
205 
206 protected:
209  template <typename stream_view_t,
210  typename seq_legal_alph_type, bool seq_qual_combined,
211  typename id_type>
212  void read_id(stream_view_t & stream_view,
214  id_type & id)
215  {
216  auto const is_id = is_char<'>'> || is_char<';'>;
217 
218  if (!is_id(*begin(stream_view)))
219  throw parse_error{std::string{"Expected to be on beginning of ID, but "} + is_id.msg.string() +
220  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
221 
222  // read id
223  if constexpr (!detail::decays_to_ignore_v<id_type>)
224  {
225  if (options.truncate_ids)
226  {
227  std::ranges::copy(stream_view | ranges::view::drop_while(is_id || is_blank) // skip leading >
228  | view::take_until_or_throw(is_cntrl || is_blank) // read ID until delimiter…
229  | view::char_to<value_type_t<id_type>>,
230  std::back_inserter(id)); // … ^A is old delimiter
231 
232  // consume rest of line
233  detail::consume(stream_view | view::take_line_or_throw);
234  }
235  else
236  {
237  std::ranges::copy(stream_view | view::take_line_or_throw // read line
238  | ranges::view::drop_while(is_id || is_blank) // skip leading >
239  | view::char_to<value_type_t<id_type>>,
240  std::back_inserter(id));
241  }
242  }
243  else
244  {
245  detail::consume(stream_view | view::take_line_or_throw);
246  }
247  }
248 
250  template <typename stream_view_t,
251  typename seq_legal_alph_type, bool seq_qual_combined,
252  typename seq_type>
253  void read_seq(stream_view_t & stream_view,
254  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const &,
255  seq_type & seq)
256  {
257  auto constexpr is_id = is_char<'>'> || is_char<';'>;
258 
259  if constexpr (!detail::decays_to_ignore_v<seq_type>)
260  {
261  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
262  std::ranges::copy(stream_view | view::take_until(is_id) // until next header (or end)
263  | ranges::view::remove_if(is_space || is_digit)// ignore whitespace and numbers
264  | view::transform([is_legal_alph] (char const c)
265  {
266  if (!is_legal_alph(c))
267  {
268  throw parse_error{std::string{"Encountered an unexpected letter: "} +
269  is_legal_alph.msg.string() +
270  " evaluated to false on " +
271  detail::make_printable(c)};
272  }
273  return c;
274  }) // enforce legal alphabet
275  | view::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
276  std::back_inserter(seq));
277  }
278  else
279  {
280  detail::consume(stream_view | view::take_until(is_id));
281  }
282  }
283 
285  template <typename stream_it_t,
286  typename id_type>
287  void write_id(stream_it_t & stream_it,
288  sequence_file_output_options const & options,
289  id_type && id)
290  {
291  if (options.fasta_legacy_id_marker)
292  stream_it = ';';
293  else
294  stream_it = '>';
295 
296  if (options.fasta_blank_before_id)
297  stream_it = ' ';
298 
299  std::ranges::copy(id, stream_it);
300 
301  detail::write_eol(stream_it, options.add_carriage_return);
302  }
303 
305  template <typename stream_it_t,
306  typename seq_type>
307  void write_seq(stream_it_t & stream_it,
308  sequence_file_output_options const & options,
309  seq_type && seq)
310  {
311  if (options.fasta_letters_per_line > 0)
312  {
313  std::ranges::copy(seq | view::to_char
314  | ranges::view::chunk(options.fasta_letters_per_line)
315  | ranges::view::join(options.add_carriage_return
316  ? std::string_view{"\r\n"}
317  : std::string_view{"\n"}),
318  stream_it);
319  // TODO(h-2): benchmark the above vs:
320 // size_t count = 0;
321 // for (auto seq_it = begin(seq); seq_it != end(seq_it); ++seq_it)
322 // {
323 // stream_it = to_char(*seq_it);
324 // ++count;
325 // if (count % fasta_letters_per_line == 0)
326 // {
327 // detail::write_eol(stream_it, options.add_carriage_return);
328 // }
329 // }
330  }
331  else
332  {
333  std::ranges::copy(seq | view::to_char, stream_it);
334  }
335 
336  detail::write_eol(stream_it, options.add_carriage_return);
337  }
338 };
339 
340 } // namespace seqan3
Provides C++20 additions to the <iterator> header.
Contains various shortcuts for common std::ranges functions.
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
Provides seqan3::view::take.
void read(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_fasta.hpp:141
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:58
Contains seqan3::dna5, container aliases and string literals.
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Output iterator that writes to std::basic_streambuf.
Definition: ranges:230
Provides seqan3::view::take_line and seqan3::view::take_line_or_throw.
Provides seqan3::view::subrange.
The options type defines various option members that influence the behaviour of all or some formats...
Definition: input_options.hpp:52
Provides seqan3::sequence_file_output_options.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides various utility functions.
Provides various utility functions.
void write(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_fasta.hpp:170
Provides seqan3::view::char_to.
Adaptations of concepts from the Ranges TS.
The FastA format.
Definition: format_fasta.hpp:109
std::ranges::iterator_range< it_t, sen_t > subrange
Create a view from a pair of iterator and sentinel.
Definition: subrange.hpp:96
Provides seqan3::view::transform.
The options type defines various option members that influence the behaviour of all or some formats...
Definition: output_options.hpp:48
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:205
Provides seqan3::view::to_char.
Provides parse conditions for tokenization.
Provides seqan3::sequence_file_input_options.
Provides various metafunctions used by the range module.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fasta.hpp:126
Contains aliases for qualified.
Provides seqan3::view::take_exactly and seqan3::view::take_exactly_or_throw.