SeqAn3
format_sam.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <iterator>
43 #include <string>
44 #include <string_view>
45 #include <vector>
46 
47 #include <range/v3/algorithm/copy.hpp>
48 #include <range/v3/view/remove_if.hpp>
49 
54 // #include <seqan3/io/alignment/input_options.hpp>
64 #include <seqan3/std/ranges>
65 #include <seqan3/std/concepts>
66 
67 namespace seqan3
68 {
69 
145 {
146 public:
// Special member functions of the format class: default construction, move
// assignment and destruction are compiler-generated; copy assignment is deleted.
// NOTE(review): listing lines 152 and 154 are not shown here - presumably the
// copy and move constructors; confirm against the original header.
151  alignment_file_format_sam() = default;
153  alignment_file_format_sam & operator=(alignment_file_format_sam const &) = delete;
155  alignment_file_format_sam & operator=(alignment_file_format_sam &&) = default;
156  ~alignment_file_format_sam() = default;
158 
// The valid file extensions for this format, initialised with the single entry "sam".
// The member is static, inline and non-const, so the list can be modified.
160  static inline std::vector<std::string> file_extensions
161  {
162  { "sam" },
163  };
164 
// Writes one alignment record to `stream` as a single tab-separated SAM line:
// id, flag, ref_id, ref_offset + 1, mapq, CIGAR (or '*'), the three mate
// fields, seq, qual, then the optional tags and an end-of-line.
//
// If options.sam_require_header is set, header_ptr is non-null and no header
// has been written by this object yet, write_header() is called first and
// written_header is set to true.
//
// Throws format_error if
//   - ref_offset + 1 is negative,
//   - get<1>(align) is non-empty while seq is empty,
//   - options.sam_require_header is set, header_ptr is non-null, ref_id is
//     non-empty and ref_id is not a key of header_ptr->ref_dict.
//
// ref_seq, e_value and bit_score are wrapped in SEQAN3_DOXYGEN_ONLY and are
// not referenced in the body.
//
// NOTE(review): this listing skips several source lines (e.g. 216, 221, 225,
// 228-229), so a number of static_assert conditions are not visible below;
// only their messages remain. Consult the original header for the conditions.
166  template <typename stream_type,
167  typename seq_type,
168  typename id_type,
169  typename offset_type,
170  typename ref_seq_type,
171  typename ref_id_type,
172  typename ref_offset_type,
173  typename align_type,
174  typename flag_type,
175  typename mapq_type,
176  typename qual_type,
177  typename mate_type,
178  typename tag_dict_type,
179  typename e_value_type,
180  typename bit_score_type>
181  void write(stream_type & stream,
182  alignment_file_output_options const & options,
183  std::unique_ptr<alignment_file_header> & header_ptr,
184  seq_type && seq,
185  qual_type && qual,
186  id_type && id,
187  offset_type && offset,
188  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
189  ref_id_type && ref_id,
190  ref_offset_type && ref_offset,
191  align_type && align,
192  flag_type && flag,
193  mapq_type && mapq,
194  mate_type && mate,
195  tag_dict_type && tag_dict,
196  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
197  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
198  {
199  /* Note the following general things:
200  *
201  * - Given the SAM specifications, all fields may be empty
202  *
203  * - Arithmetic values default to 0 while all others default to '*'
204  *
205  * - Because of the former, arithmetic values can be directly streamed
206  * into 'stream' as operator<< is defined for all arithmetic types
207  * and the default value (0) is also the SAM default.
208  *
209  * - All other non-arithmetic values need to be checked for emptiness
210  */
211 
212  // ---------------------------------------------------------------------
213  // Type Requirements (as static asserts for user friendliness)
214  // ---------------------------------------------------------------------
215  static_assert((std::ranges::ForwardRange<seq_type> &&
217  "The seq object must be a std::ranges::ForwardRange over "
218  "letters that model seqan3::alphabet_concept.");
219 
220  static_assert((std::ranges::ForwardRange<id_type> &&
222  "The id object must be a std::ranges::ForwardRange over "
223  "letters that model seqan3::alphabet_concept.");
224 
226  "The offset object must be a std::UnsignedIntegral.");
227 
230  "The ref_seq object must be a std::ranges::ForwardRange "
231  "over letters that model seqan3::alphabet_concept.");
232 
235  "The ref_id object must be a std::ranges::ForwardRange "
236  "over letters that model seqan3::alphabet_concept.");
237 
238  static_assert(std::Integral<remove_cvref_t<ref_offset_type>>, // -1 is given default to evaluate to 0
239  "The ref_offset object must be an std::Integral >= 0.");
240 
// Run-time part of the ref_offset requirement: any value below -1 is rejected,
// while -1 is accepted and is later written as position 0 (ref_offset + 1).
241  if (((ref_offset + 1) < 0))
242  throw format_error("The ref_offset object must be an std::Integral >= 0.");
243 
245  "The align object must be a std::pair of two ranges whose "
246  "value_type is comparable to seqan3::gap");
247 
248  static_assert((std::tuple_size_v<remove_cvref_t<align_type>> == 2 &&
249  std::EqualityComparableWith<gap, value_type_t<remove_cvref_t<decltype(std::get<0>(align))>>> &&
250  std::EqualityComparableWith<gap, value_type_t<remove_cvref_t<decltype(std::get<1>(align))>>>),
251  "The align object must be a std::pair of two ranges whose "
252  "value_type is comparable to seqan3::gap");
253 
255  "The flag object must be a std::UnsignedIntegral.");
256 
258  "The mapq object must be a std::UnsignedIntegral.");
259 
260  static_assert((std::ranges::ForwardRange<qual_type> &&
262  "The qual object must be a std::ranges::ForwardRange "
263  "over letters that model seqan3::alphabet_concept.");
264 
// NOTE(review): in both mate messages below the adjacent literals
// "... and" and "3) ..." are concatenated without a separating space, so the
// resulting diagnostic reads "and3)".
266  "The mate object must be a std::tuple of size 3 with "
267  "1) a std::ranges::ForwardRange with a value_type modelling seqan3::alphabet_concept, "
268  "2) an std::UnsignedIntegral, and"
269  "3) an std::UnsignedIntegral.");
270 
271  static_assert((std::ranges::ForwardRange<decltype(std::get<0>(mate))> &&
272  alphabet_concept<value_type_t<remove_cvref_t<decltype(std::get<0>(mate))>>> &&
273  std::UnsignedIntegral<remove_cvref_t<decltype(std::get<1>(mate))>> &&
274  std::UnsignedIntegral<remove_cvref_t<decltype(std::get<2>(mate))>>),
275  "The mate object must be a std::tuple of size 3 with "
276  "1) a std::ranges::ForwardRange with a value_type modelling seqan3::alphabet_concept, "
277  "2) an std::UnsignedIntegral, and"
278  "3) an std::UnsignedIntegral.");
279 
281  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
282 
283  // ---------------------------------------------------------------------
284  // logical Requirements
285  // ---------------------------------------------------------------------
// An alignment (second element of align non-empty) cannot be written without
// the sequence it refers to.
286  if (!empty(get<1>(align)) && empty(seq))
287  throw format_error("If you specify an align object you must also specify the seq object. "
288  "Hint: Check if offset needs to be set to if soft-clipping is present.");
289 
// When a header is required and available, a non-empty ref_id has to be one of
// the keys of header_ptr->ref_dict.
290  if (options.sam_require_header && (header_ptr != nullptr) && !empty(ref_id))
291  {
292  if ((header_ptr->ref_dict).count(std::string(ref_id)) == 0) // no reference id matched
293  throw format_error(std::string("The ref_id '") + std::string(ref_id) +
294  "' was not in the list of references");
295  }
296 
297  // ---------------------------------------------------------------------
298  // Writing the Header on first call
299  // ---------------------------------------------------------------------
300  if (options.sam_require_header && !written_header && (header_ptr != nullptr))
301  {
302  write_header(stream, options, header_ptr);
303  written_header = true;
304  }
305 
306  // ---------------------------------------------------------------------
307  // Writing the Record
308  // ---------------------------------------------------------------------
// Range-like fields go through write_range() (which prints '*' for empty
// ranges); arithmetic fields are streamed directly with operator<<.
309  std::ranges::ostreambuf_iterator stream_it{stream};
310  char const separator{'\t'};
311 
312  write_range(stream_it, std::forward<id_type>(id));
313 
314  stream << separator;
315 
316  stream << std::forward<flag_type>(flag) << separator;
317 
318  write_range(stream_it, std::forward<ref_id_type>(ref_id));
319 
320  stream << separator;
321 
322  stream << (ref_offset + 1) << separator; // SAM is 1 based
323 
324  stream << std::forward<mapq_type>(mapq) << separator;
325 
326  if (!empty(get<1>(align)))
327  {
328  // compute possible distance from alignment end to sequence end
329  // which indicates soft clipping at the end.
330  // This should be replace by a free count_gaps function for
331  // aligned sequences which is more efficient if possible.
// In effect: off_end = seq.size() - offset - (number of non-gap symbols in
// get<1>(align)), computed as size - offset + gaps - aligned length.
332  size_t off_end{seq.size() - offset};
333  for (auto chr : get<1>(align))
334  if (chr == gap::GAP)
335  ++off_end;
336  off_end -= (get<1>(align)).size();
337 
338  write_range(stream_it,
339  detail::get_cigar_string(std::forward<align_type>(align),
340  std::forward<offset_type>(offset),
341  off_end));
342  }
343  else
344  {
// No alignment given: the CIGAR column holds the '*' placeholder.
345  stream << '*';
346  }
347 
348  stream << separator;
349 
// The three elements of mate are written as three consecutive columns.
350  write_range(stream_it, get<0>(std::forward<mate_type>(mate)));
351 
352  stream << separator;
353 
354  stream << get<1>(std::forward<mate_type>(mate)) << separator;
355 
356  stream << get<2>(std::forward<mate_type>(mate)) << separator;
357 
358  write_range(stream_it, std::forward<seq_type>(seq));
359 
360  stream << separator;
361 
362  write_range(stream_it, std::forward<qual_type>(qual));
363 
// write_tag_fields() emits the separator itself before every tag, so no
// separator is written here after qual.
364  write_tag_fields(stream, std::forward<tag_dict_type>(tag_dict), separator);
365 
366  detail::write_eol(stream_it, options.add_carriage_return);
367  }
368 
369 protected:
// SAM format version string; write_header() streams it right after "@HD\tVN:".
372  static constexpr char format_version[4] = "1.6";
373 
// Set to true by write() after it has called write_header(), so that the header
// is written at most once per format object.
375  bool written_header{false};
376 
384  template <typename stream_it_t, typename field_type>
388  void write_range(stream_it_t & stream_it, field_type && field_value)
389  {
390  if (empty(field_value))
391  stream_it = '*';
392  else
393  std::ranges::copy(field_value | view::to_char | view::take_until(is_space), stream_it);
394  }
395 
403  template <typename stream_t>
404  void write_tag_fields(stream_t & stream, sam_tag_dictionary const & tag_dict, char const separator)
405  {
406  auto stream_variant_fn = [&stream] (auto && arg) // helper to print an std::variant
407  {
409 
410  if constexpr (!container_concept<T>)
411  {
412  stream << arg;
413  }
414  else
415  {
416  if (arg.begin() != arg.end())
417  {
418  for (auto it = arg.begin(); it != (arg.end() - 1); ++it)
419  stream << *it << ",";
420 
421  stream << *(arg.end() - 1); // write last value without trailing ','
422  }
423  }
424  };
425 
426  for (auto & [tag, variant] : tag_dict)
427  {
428  stream << separator;
429 
430  char char0 = tag / 256;
431  char char1 = tag % 256;
432 
433  stream << char0 << char1 << ':' << detail::sam_tag_type_char[variant.index()] << ':';
434 
435  if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
436  stream << detail::sam_tag_type_char_extra[variant.index()] << ',';
437 
438  std::visit(stream_variant_fn, variant);
439  }
440  }
441 
458  template <typename stream_t>
459  void write_header(stream_t & stream,
460  alignment_file_output_options const & options,
461  std::unique_ptr<alignment_file_header> & header_ptr)
462  {
463  if (header_ptr != nullptr)
464  {
465  // -----------------------------------------------------------------
466  // Check Header
467  // -----------------------------------------------------------------
468 
469  // (@HD) Check header_ptr line
470  // The format version string will be taken from the local member variable
471  if (!(header_ptr->sorting == "unknown" ||
472  header_ptr->sorting == "unsorted" ||
473  header_ptr->sorting == "queryname" ||
474  header_ptr->sorting == "coordinate" ))
475  throw format_error{"SAM format error: The header_ptr->sorting member must be "
476  "one of [unknown, unsorted, queryname, coordinate]."};
477 
478  if (!(header_ptr->grouping == "none" ||
479  header_ptr->grouping == "query" ||
480  header_ptr->grouping == "reference"))
481  throw format_error{"SAM format error: The header_ptr->grouping member must be "
482  "one of [none, query, reference]."};
483 
484  // (@SQ) Check Reference Sequence Dictionary lines
485 
486  // TODO
487 
488  // - sorting order be one of ...
489  // - grouping can be one of ...
490  // - reference names must be unique
491  // - ids of read groups must be unique
492  // - program ids need to be unique
493  // many more small semantic things, like fits REGEX
494 
495  // -----------------------------------------------------------------
496  // Write Header
497  // -----------------------------------------------------------------
498  std::ranges::ostreambuf_iterator stream_it{stream};
499 
500  // (@HD) Write header_ptr line [required].
501  stream << "@HD\tVN:";
502  stream << format_version;
503 
504  if (!header_ptr->sorting.empty())
505  stream << "\tSO:" << header_ptr->sorting;
506 
507  if (!header_ptr->grouping.empty())
508  stream << "\tGO:" << header_ptr->grouping;
509 
510  detail::write_eol(stream_it, options.add_carriage_return);
511 
512  // (@SQ) Write Reference Sequence Dictionary lines [required].
513  for (auto const & [ref_name, ref_info] : header_ptr->ref_dict)
514  {
515  stream << "@SQ"
516  << "\tSN:" << ref_name
517  << "\tLN:" << get<0>(ref_info);
518 
519  if (!get<1>(ref_info).empty())
520  stream << "\t" << get<1>(ref_info);
521 
522  detail::write_eol(stream_it, options.add_carriage_return);
523  }
524 
525  // Write read group (@RG) lines if specified.
526  for (auto const & read_group : header_ptr->read_groups)
527  {
528  stream << "@RG"
529  << "\tID:" << get<0>(read_group);
530 
531  if (!get<1>(read_group).empty())
532  stream << "\t" << get<1>(read_group);
533 
534  detail::write_eol(stream_it, options.add_carriage_return);
535  }
536 
537  // Write program (@PG) lines if specified.
538  for (auto const & program : header_ptr->program_infos)
539  {
540  stream << "@PG"
541  << "\tID:" << program.id;
542 
543  if (!program.name.empty())
544  stream << "\tPN:" << program.name;
545 
546  if (!program.command_line_call.empty())
547  stream << "\tCL:" << program.command_line_call;
548 
549  if (!program.previous.empty())
550  stream << "\tPP:" << program.previous;
551 
552  if (!program.description.empty())
553  stream << "\tDS:" << program.description;
554 
555  if (!program.version.empty())
556  stream << "\tVN:" << program.version;
557 
558  detail::write_eol(stream_it, options.add_carriage_return);
559  }
560 
561  // Write comment (@CO) lines if specified.
562  for (auto const & comment : header_ptr->comments)
563  {
564  stream << "@CO\t" << comment;
565  detail::write_eol(stream_it, options.add_carriage_return);
566  }
567  }
568  }
569 };
570 
571 } // namespace seqan3
Provides seqan3::view::get.
The (most general) container concept as defined by the standard library.
void write(stream_type &stream, alignment_file_output_options const &options, std::unique_ptr< alignment_file_header > &header_ptr, seq_type &&seq, qual_type &&qual, id_type &&id, offset_type &&offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, ref_offset_type &&ref_offset, align_type &&align, flag_type &&flag, mapq_type &&mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:181
auto constexpr take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:448
The alphabet of a gap character '-'.
Definition: gap.hpp:62
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:94
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides C++20 additions to the <iterator> header.
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
Whether a type behaves like a tuple.
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: ranges:200
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:54
The generic alphabet concept that covers most data types used in ranges. This is the core alphabet con...
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:58
Auxiliary functions for the alignment IO.
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Output iterator that writes to std::basic_streambuf.
Definition: ranges:230
Provides seqan3::alignment_file_output_options.
The concept Integral is satisfied if and only if T is an integral type.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides seqan3::tuple_like_concept.
Provides the seqan3::alignment_file_header class.
Provides various utility functions.
Provides seqan3::view::char_to.
The Concepts library.
std::remove_cv_t< std::remove_reference_t< t > > remove_cvref_t
Return the input type with const, volatile and references removed [Type metafunction].
Definition: basic.hpp:64
Adaptations of concepts from the Ranges TS.
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:68
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:88
Specifies requirements of a Range type for which begin returns a type that models std::ForwardIterato...
The options type defines various option members that influence the behavior of all or some formats...
Definition: output_options.hpp:49
auto constexpr is_space
Checks whether c is a space character.
Definition: parse_condition.hpp:171
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:161
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:205
Provides seqan3::view::to_char.
Provides parse conditions for tokenization.
Requires std::detail::WeaklyEqualityComparableWith<t1,t2>, but also that t1 and t2, as well as their common_reference_t satisfy std::EqualityComparable.
Provides various metafunctions used by the range module.
typename value_type< t >::type value_type_t
Type metafunction shortcut for seqan3::value_type.
Definition: pre.hpp:72
The concept std::UnsignedIntegral is satisfied if and only if T is an integral type and std::is_signe...
The SAM format.
Definition: format_sam.hpp:144
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:339