SeqAn3
format_fastq.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <iterator>
43 #include <string>
44 #include <string_view>
45 #include <vector>
46 
47 #include <range/v3/algorithm/copy.hpp>
48 #include <range/v3/utility/iterator.hpp>
49 #include <range/v3/view/chunk.hpp>
50 #include <range/v3/view/join.hpp>
51 #include <range/v3/view/remove_if.hpp>
52 
69 #include <seqan3/std/ranges>
72 
73 namespace seqan3
74 {
107 {
108 public:
// Special member functions visible in this listing: the default constructor and the move
// assignment operator are defaulted; the copy assignment operator is deleted.
// NOTE(review): listing lines 114 and 116 are absent from this view — presumably the copy and
// move constructors; confirm against the original header.
113  sequence_file_format_fastq() = default;
115  sequence_file_format_fastq & operator=(sequence_file_format_fastq const &) = delete;
117  sequence_file_format_fastq & operator=(sequence_file_format_fastq &&) = default;
119 
// File extensions associated with this format: "fastq" and "fq".
// The member is a non-const static inline variable, so the list can be modified at runtime.
121  static inline std::vector<std::string> file_extensions
122  {
123  { "fastq" },
124  { "fq" }
125  };
126 
// Reads a single FASTQ record from `stream` into the given field buffers.
//
// Template parameters:
//   seq_legal_alph_type - alphabet whose is_in_alphabet condition every sequence character must satisfy.
//   seq_qual_combined   - when true, `sequence` and `qualities` must be the same object (asserted below).
// Parameters:
//   stream    - input stream; consumed through std::istreambuf_iterator<char>.
//   sequence  - receives the sequence via back-insertion, or is an "ignore" placeholder.
//   id        - receives the ID via back-insertion, or is an "ignore" placeholder.
//   qualities - receives the qualities, or is an "ignore" placeholder.
// Throws:
//   parse_error if the record does not start with '@', if the second ID line does not start
//   with '+', or if a sequence character is rejected by the legal alphabet.
//   The *_or_throw views used below can presumably throw as well (per their names) — confirm.
//
// NOTE(review): listing line 134 is absent from this view although the body reads
// `options.truncate_ids`. The signature summary further down this page lists a
// `sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const & options`
// parameter directly after `stream`; confirm against the original header.
128  template <typename stream_type, // constraints checked by file
129  typename seq_legal_alph_type, bool seq_qual_combined,
130  typename seq_type, // other constraints checked inside function
131  typename id_type,
132  typename qual_type>
133  void read(stream_type & stream,
135  seq_type & sequence,
136  id_type & id,
137  qual_type & qualities)
138  {
     // Single-pass view over the stream buffer; every parsing step below consumes from it.
139  auto stream_view = view::subrange<decltype(std::istreambuf_iterator<char>{stream}),
140  decltype(std::istreambuf_iterator<char>{})>
141  {std::istreambuf_iterator<char>{stream},
142  std::istreambuf_iterator<char>{}};
143 
     // Used only to peek at the current character ('@' / '+' checks) and to skip the '@'.
144  auto stream_it = begin(stream_view);
145 
146  // cache the begin position so we write quals to the same position as seq in seq_qual case
147  size_t sequence_size_before = 0;
148  size_t sequence_size_after = 0;
149  if constexpr (!detail::decays_to_ignore_v<seq_type>)
150  sequence_size_before = size(sequence);
151 
152  /* ID */
153  if (*stream_it != '@') // [[unlikely]]
154  {
155  throw parse_error{std::string{"Expected '@' on beginning of ID line, got: "} +
156  detail::make_printable(*stream_it)};
157  }
158  ++stream_it; // skip '@'
159 
160  if constexpr (!detail::decays_to_ignore_v<id_type>)
161  {
162  if (options.truncate_ids)
163  {
     // NOTE(review): listing lines 164-165 are absent from this view; the line below is only
     // the tail of a call that back-inserts into `id`. Restore its head from the original header.
166  std::back_inserter(id));
     // Discard whatever is left of the ID line after the truncated ID.
167  detail::consume(stream_view | view::take_line_or_throw);
168  }
169  else
170  {
     // NOTE(review): listing lines 171-172 are absent from this view; the line below is only
     // the tail of a call that back-inserts into `id`. Restore its head from the original header.
173  std::back_inserter(id));
174  }
175  }
176  else
177  {
     // ID is ignored: skip the whole ID line.
178  detail::consume(stream_view | view::take_line_or_throw);
179  }
180 
181  /* Sequence */
182  auto seq_view = stream_view | view::take_until_or_throw(is_char<'+'>) // until 2nd ID line
183  | ranges::view::remove_if(is_space); // ignore whitespace
184  if constexpr (!detail::decays_to_ignore_v<seq_type>)
185  {
186  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
187  std::ranges::copy(seq_view | view::transform([is_legal_alph] (char const c) // enforce legal alphabet
188  {
189  if (!is_legal_alph(c))
190  {
191  throw parse_error{std::string{"Encountered an unexpected letter: "} +
192  is_legal_alph.msg.string() +
193  " evaluated to false on " +
194  detail::make_printable(c)};
195  }
196  return c;
197  })
198  | view::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
199  std::back_inserter(sequence));
200  sequence_size_after = size(sequence);
201  }
202  else // consume, but count
203  {
     // The character count is still needed to know how many quality values to read below.
204  auto it = begin(seq_view);
205  auto it_end = end(seq_view);
206  while (it != it_end)
207  {
208  ++it;
209  ++sequence_size_after;
210  }
211  }
212 
213  /* 2nd ID line */
214  if (*stream_it != '+') // [[unlikely]]
215  {
216  throw parse_error{std::string{"Expected '+' on beginning of 2nd ID line, got: "} +
217  detail::make_printable(*stream_it)};
218  }
     // The content of the second ID line is discarded.
219  detail::consume(stream_view | view::take_line_or_throw);
220 
221  /* Qualities */
     // Exactly as many non-whitespace quality characters are taken as sequence characters were read.
222  auto qview = stream_view | ranges::view::remove_if(is_space) // this consumes trailing newline
223  | view::take_exactly_or_throw(sequence_size_after - sequence_size_before);
224  if constexpr (seq_qual_combined)
225  {
226  // seq_qual field implies that they are the same variable
227  assert(std::addressof(sequence) == std::addressof(qualities));
     // NOTE(review): listing line 228 is absent from this view; the line below is only the tail
     // of a call whose output starts at the first element added by this record.
229  begin(qualities) + sequence_size_before);
230  }
231  else if constexpr (!detail::decays_to_ignore_v<qual_type>)
232  {
     // NOTE(review): listing line 233 is absent from this view; the line below is only the tail
     // of a call that back-inserts into `qualities`.
234  std::back_inserter(qualities));
235  }
236  else
237  {
238  detail::consume(qview);
239  }
240 
241  // make sure "buffer at end" implies "stream at end"
242  if ((std::istreambuf_iterator<char>{stream} == std::istreambuf_iterator<char>{}) &&
243  (!stream.eof()))
244  {
245  stream.get(); // triggers error in stream and sets eof
246  }
247  }
248 
250  template <typename stream_type, // constraints checked by file
251  typename seq_type, // other constraints checked inside function
252  typename id_type,
253  typename qual_type>
254  void write(stream_type & stream,
255  sequence_file_output_options const & options,
256  seq_type && sequence,
257  id_type && id,
258  qual_type && qualities)
259  {
260  std::ranges::ostreambuf_iterator stream_it{stream};
261 
262  // ID
263  if constexpr (detail::decays_to_ignore_v<id_type>)
264  {
265  throw std::logic_error{"The ID field may not be set to ignore when writing FASTQ files."};
266  }
267  else
268  {
269  if (empty(id)) //[[unlikely]]
270  throw std::runtime_error{"The ID field may not be empty when writing FASTQ files."};
271 
272  stream_it = '@';
273  std::ranges::copy(id, stream_it);
274 
275  detail::write_eol(stream_it, options.add_carriage_return);
276  }
277 
278  // Sequence
279  if constexpr (detail::decays_to_ignore_v<seq_type>)
280  {
281  throw std::logic_error{"The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
282  }
283  else
284  {
285  if (empty(sequence)) //[[unlikely]]
286  throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
287 
288  std::ranges::copy(sequence | view::to_char, stream_it);
289 
290  detail::write_eol(stream_it, options.add_carriage_return);
291  }
292 
293  // 2nd ID-line
294  if constexpr (!detail::decays_to_ignore_v<id_type>)
295  {
296  stream_it = '+';
297 
298  if (options.fastq_double_id)
299  std::ranges::copy(id, stream_it);
300 
301  detail::write_eol(stream_it, options.add_carriage_return);
302  }
303 
304  // Quality line
305  if constexpr (detail::decays_to_ignore_v<qual_type>)
306  {
307  throw std::logic_error{"The QUAL and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
308  }
309  else
310  {
311  if (empty(qualities)) //[[unlikely]]
312  throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
313 
315  {
316  assert(size(sequence) == size(qualities));
317  }
318 
319  std::ranges::copy(qualities | view::to_char, stream_it);
320 
321  detail::write_eol(stream_it, options.add_carriage_return);
322  }
323  }
324 };
325 
326 } // namespace seqan3
The FastQ format.
Definition: format_fastq.hpp:106
constexpr auto transform
A range adaptor that takes an invocable and returns a view of the elements with the invocable applied...
Definition: transform.hpp:95
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fastq.hpp:122
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:55
Provides C++20 additions to the <iterator> header.
Contains various shortcuts for common std::ranges functions.
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
Provides seqan3::view::take.
auto constexpr is_blank
Checks whether c is a blank character.
Definition: parse_condition.hpp:188
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: ranges:200
::ranges::size size
Alias for ranges::size. Obtains the size of a range whose size can be calculated in constant time...
Definition: ranges:195
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:58
Contains seqan3::dna5, container aliases and string literals.
bool fastq_double_id
Whether to write the ID only after '@' or also after the '+' line.
Definition: output_options.hpp:61
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Output iterator that writes to std::basic_streambuf.
Definition: ranges:230
Provides seqan3::view::take_line and seqan3::view::take_line_or_throw.
Provides seqan3::view::subrange.
void read(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_fastq.hpp:133
The options type defines various option members that influence the behaviour of all or some formats...
Definition: input_options.hpp:52
auto constexpr is_cntrl
Checks whether c is a control character.
Definition: parse_condition.hpp:135
Thrown if there is a parse error, such as reading an unexpected character from an input stream...
Definition: exception.hpp:70
Provides seqan3::sequence_file_output_options.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides various utility functions.
Provides various utility functions.
Provides seqan3::view::char_to.
auto constexpr take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-lin...
Definition: take_line.hpp:471
Adaptations of concepts from the Ranges TS.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:88
std::ranges::iterator_range< it_t, sen_t > subrange
Create a view from a pair of iterator and sentinel.
Definition: subrange.hpp:96
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:66
auto constexpr is_space
Checks whether c is a space character.
Definition: parse_condition.hpp:171
void write(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_fastq.hpp:254
auto constexpr take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:462
Provides seqan3::view::transform.
The options type defines various option members that influence the behaviour of all or some formats...
Definition: output_options.hpp:48
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:205
Provides seqan3::view::to_char.
Provides parse conditions for tokenization.
Provides seqan3::sequence_file_input_options.
Provides various metafunctions used by the range module.
typename value_type< t >::type value_type_t
Type metafunction shortcut for seqan3::value_type.
Definition: pre.hpp:72
Contains aliases for qualified.
constexpr auto back_inserter(container_t &container)
Create a std::back_insert_iterator for the argument.
Definition: iterator:79
Specifies the requirements of a Range type that knows its size in constant time with the size functio...
Provides seqan3::view::take_exactly and seqan3::view::take_exactly_or_throw.
auto constexpr take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:122
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:92