SeqAn3
input.hpp
Go to the documentation of this file.
1 // ============================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ============================================================================
4 //
5 // Copyright (c) 2006-2018, Knut Reinert & Freie Universitaet Berlin
6 // Copyright (c) 2016-2018, Knut Reinert & MPI Molekulare Genetik
7 // All rights reserved.
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
18 // its contributors may be used to endorse or promote products derived
19 // from this software without specific prior written permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
25 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
31 // DAMAGE.
32 //
33 // ============================================================================
34 
40 #pragma once
41 
42 #include <cassert>
43 #include <fstream>
44 #include <string>
45 #include <variant>
46 #include <vector>
47 
48 #include <range/v3/algorithm/equal.hpp>
49 
57 #include <seqan3/io/exception.hpp>
58 #include <seqan3/io/filesystem.hpp>
59 #include <seqan3/io/record.hpp>
61 #include <seqan3/io/detail/record.hpp>
66 
67 namespace seqan3
68 {
69 
70 // ----------------------------------------------------------------------------
71 // sequence_file_input_traits_concept
72 // ----------------------------------------------------------------------------
73 
144 template <typename t>
147 concept sequence_file_input_traits_concept = requires (t v)
148 {
     // Requirements on a traits type: for each of sequence, id and quality it must name an
     // alphabet, a container template for one field and a container-of-containers template.

     // sequence: the stored alphabet, the "legal" alphabet, and a conversion from legal to stored
149  requires alphabet_concept<typename t::sequence_alphabet>;
150  requires alphabet_concept<typename t::sequence_legal_alphabet>;
151  requires explicitly_convertible_to_concept<typename t::sequence_legal_alphabet, typename t::sequence_alphabet>;
     // container for one sequence, and container of such containers
152  requires sequence_container_concept<typename t::template sequence_container<typename t::sequence_alphabet>>;
153  requires sequence_container_concept<typename t::template sequence_container_container<
154  typename t::template sequence_container<typename t::sequence_alphabet>>>;
155 
     // id: alphabet, container for one id, container of id containers
156  requires alphabet_concept<typename t::id_alphabet>;
157  requires sequence_container_concept<typename t::template id_container<typename t::id_alphabet>>;
158  requires sequence_container_concept<typename t::template id_container_container<typename t::template id_container<
159  typename t::id_alphabet>>>;
160 
     // quality: must model quality_concept (not merely alphabet_concept), plus the two containers
161  requires quality_concept<typename t::quality_alphabet>;
162  requires sequence_container_concept<typename t::template quality_container<typename t::quality_alphabet>>;
163  requires sequence_container_concept<typename t::template quality_container_container<
164  typename t::template quality_container<typename t::quality_alphabet>>>;
165 };
167 
168 // ----------------------------------------------------------------------------
169 // sequence_file_input_default_traits
170 // ----------------------------------------------------------------------------
171 
198 {
203  using sequence_alphabet = dna5;
205  template <typename _sequence_alphabet>
206  using sequence_container = std::vector<_sequence_alphabet>;
207  template <typename _sequence_container>
209 
210  using id_alphabet = char;
211  template <typename _id_alphabet>
212  using id_container = std::basic_string<_id_alphabet>;
213  template <typename _id_container>
215 
216  using quality_alphabet = phred42;
217  template <typename _quality_alphabet>
218  using quality_container = std::vector<_quality_alphabet>;
219  template <typename _quality_container>
222 };
223 
227 {
232  using sequence_alphabet = aa27;
235 };
236 
237 // ----------------------------------------------------------------------------
238 // sequence_file_input
239 // ----------------------------------------------------------------------------
240 
462 template <
463  sequence_file_input_traits_concept traits_type_ = sequence_file_input_default_traits_dna,
464  detail::fields_concept selected_field_ids_ = fields<field::SEQ,
465  field::ID,
466  field::QUAL>,
467  detail::type_list_of_sequence_file_input_formats_concept valid_formats_ = type_list<sequence_file_format_fasta,
469  /*, ...*/>,
470  istream_concept<char> stream_type_ = std::ifstream>
472 {
473 public:
478  using traits_type = traits_type_; // traits that define aliases/templates for storage of the fields
481  using selected_field_ids = selected_field_ids_; // fields list with the fields selected for the record
483  using valid_formats = valid_formats_; // type_list with the possible formats
485  using stream_type = stream_type_; // type of the underlying stream
487 
492 
493  static_assert([] () constexpr
494  {
     // Compile-time check: every field in selected_field_ids must be contained in field_ids
     // (field_ids is declared outside the lines visible in this listing).
495  for (field f : selected_field_ids::as_array)
496  if (!field_ids::contains(f))
497  return false;
498  return true;
499  }(),
500  "You selected a field that is not valid for sequence files, please refer to the documentation "
501  "of sequence_file_input::field_ids for the accepted values.");
502 
503  static_assert([] () constexpr
504  {
     // Compile-time check: field::SEQ_QUAL is mutually exclusive with field::SEQ and field::QUAL;
     // the expression is false exactly when SEQ_QUAL is selected together with SEQ or QUAL.
505  return !(selected_field_ids::contains(field::SEQ_QUAL) &&
506  (selected_field_ids::contains(field::SEQ) ||
507  (selected_field_ids::contains(field::QUAL))));
508  }(),
509  "You may not select field::SEQ_QUAL and either of field::SEQ and field::QUAL at the same time.");
510 
516  using sequence_type = typename traits_type::template sequence_container< // type of field::SEQ
518  typename traits_type::sequence_alphabet>;
520  using id_type = typename traits_type::template id_container< // type of field::ID
521  typename traits_type::id_alphabet>;
523  using quality_type = typename traits_type::template quality_container< // type of field::QUAL
524  typename traits_type::quality_alphabet>;
     // type of field::SEQ_QUAL: the sequence container instantiated over qualified<sequence_alphabet,
     // quality_alphabet>, i.e. each element carries a sequence letter joined with a quality value
526  using sequence_quality_type = typename traits_type::
527  template sequence_container<qualified<typename traits_type::sequence_alphabet,
528  typename traits_type::quality_alphabet>>;
529 
532 
537 
543  using sequence_column_type = typename traits_type::template sequence_container_container<sequence_type>; // column type of field::SEQ
546  using id_column_type = typename traits_type::template id_container_container<id_type>; // column type of field::ID
548  using quality_column_type = typename traits_type::template quality_container_container<quality_type>; // column type of field::QUAL
     // Column type of field::SEQ_QUAL. sequence_quality_type is built from the traits' sequence_container,
     // so its column must use the matching sequence_container_container (previously id_container_container,
     // which only worked because the default traits use the same template for both).
550  using sequence_quality_column_type = typename traits_type::template sequence_container_container<sequence_quality_type>;
557  using file_as_tuple_type = record<detail::select_types_with_ids_t<field_column_types,
558  field_ids,
560  selected_field_ids>;
562 
567  using value_type = record_type; // one element of the file is one record
572  using const_reference = void; // void: files are not const-iterable
574  using size_type = size_t; // unsigned integer type
576  using difference_type = std::make_signed_t<size_t>; // signed counterpart of size_t
578  using iterator = detail::in_file_iterator<sequence_file_input>; // iterator over this file (an input iterator)
580  using const_iterator = void; // void: files are not const-iterable
584 
588  sequence_file_input() = delete; // no default construction
591  sequence_file_input(sequence_file_input const &) = delete; // not copy-constructible
593  sequence_file_input & operator=(sequence_file_input const &) = delete; // not copy-assignable
597  sequence_file_input & operator=(sequence_file_input &&) = default; // move assignment is compiler-generated
599  ~sequence_file_input() = default; // compiler-generated destructor
600 
610  sequence_file_input(filesystem::path const & _file_name,
611  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{})
612  {
     // Construct from a file name: open the stream in binary input mode, pick the format whose
     // file_extensions contain the path's extension, then buffer the first record.
     // _file_name: path of the file to read.
     // second parameter: tag object used only for its type (the name exists for Doxygen only).
     // Throws file_open_error if the stream is not open after open(), and
     // unhandled_extension_error if no format in valid_formats matches the extension.
     // NOTE(review): the file_name member is not assigned here - confirm whether that is intended.
613  // open stream
614  stream.open(_file_name, std::ios_base::in | std::ios::binary);
615  if (!stream.is_open())
616  throw file_open_error{"Could not open file for reading."};
617 
618  // initialise format handler
619  bool format_found = false;
620  std::string extension = _file_name.extension().string();
     // size() > 1 means there is something after the leading "."; otherwise no format is looked up
621  if (extension.size() > 1)
622  {
623  extension = extension.substr(1); // drop leading "."
     // try each candidate format type; the lambda is invoked once per type in valid_formats
624  meta::for_each(valid_formats{}, [&] (auto && fmt)
625  {
626  using fmt_type = remove_cvref_t<decltype(fmt)>;
627 
628  for (auto const & ext : fmt_type::file_extensions)
629  {
630  if (std::ranges::equal(ext, extension))
631  {
632  format = fmt_type{};
633  format_found = true;
     // leaves only this lambda invocation; presumably for_each still visits the remaining
     // formats, so a later format listing the same extension would overwrite this one - TODO confirm
634  return;
635  }
636  }
637  });
638  }
639  if (!format_found)
640  throw unhandled_extension_error("No valid format found for this extension.");
641 
642  // buffer first record
643  read_next_record();
644  }
645  /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
646  * A combination of default template parameters and auto-deduction guides works as expected,
647  * independent of whether the second/optional parameter is specified or not, i.e. it is possible
648  * to auto-deduce and overwrite a single template parameter out of the four if the optional parameter
649  * is specified and use the default otherwise.
650  */
651 
658  template <sequence_file_input_format_concept file_format>
660  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
661  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
662  stream{std::move(_stream)}, format{file_format{}}
663  {
664  static_assert(meta::in<valid_formats, file_format>::value,
665  "You selected a format that is not in the valid_formats of this file.");
666 
667  // buffer first record
668  read_next_record();
669  }
671 
689  iterator begin() noexcept
690  {
     // Returns an iterator constructed from this file object; it refers to the file's current
     // position, not necessarily to the first record.
691  return {*this};
692  }
693 
707  sentinel end() noexcept
708  {
     // Returns a default-constructed sentinel for comparison with the iterator.
709  return {};
710  }
711 
748  reference front() noexcept
749  {
     // Returns the buffered record the file is currently at; performs no read.
750  return record_buffer;
751  }
753 
758  template <field f>
760  friend auto & get(sequence_file_input & file)
761  {
     // Column access by field id: returns a reference to the column of field f in columns_buffer.
     // Rejected at compile time if f was not selected for this file.
762  static_assert(sequence_file_input::selected_field_ids::contains(f),
763  "You requested a field via get that was not selected for the file.");
764 
     // moves the records still to be read into the column buffers before handing out the column
765  file.read_columns();
766 
767  return seqan3::get<f>(file.columns_buffer);
768  }
769 
771  template <field f>
772  friend auto && get(sequence_file_input && file)
773  {
     // Rvalue overload of column access by field id: `file` is an lvalue inside the body, so this
     // delegates to the lvalue overload and casts the returned column to an rvalue.
774  return std::move(get<f>(file));
775  }
776 
778  template <size_t i>
779  friend auto & get(sequence_file_input & file)
780  {
     // Column access by position: returns a reference to the i-th element of columns_buffer.
     // Rejected at compile time if i is not smaller than the number of selected fields.
781  static_assert(i < sequence_file_input::selected_field_ids::as_array.size(),
782  "You requested a field number larger than the number of selected fields for the file.");
     // moves the records still to be read into the column buffers before handing out the column
783  file.read_columns();
784 
785  return std::get<i>(file.columns_buffer);
786  }
787 
789  template <size_t i>
790  friend auto && get(sequence_file_input && file)
791  {
     // Rvalue overload of column access by position: delegates to the lvalue overload and casts
     // the returned column to an rvalue.
792  return std::move(get<i>(file));
793  }
794 
796  template <typename t>
797  friend auto & get(sequence_file_input & file)
798  {
     // Column access by type: returns a reference to the element of columns_buffer whose type is t.
     // moves the records still to be read into the column buffers before handing out the column
799  file.read_columns();
800 
801  return std::get<t>(file.columns_buffer);
802  }
803 
805  template <typename t>
806  friend auto && get(sequence_file_input && file)
807  {
     // Rvalue overload of column access by type: delegates to the lvalue overload and casts the
     // returned column to an rvalue.
808  return std::move(get<t>(file));
809  }
811 
     // Public options object; its members can be set directly. It is parameterised on the traits'
     // sequence_legal_alphabet and on whether field::SEQ_QUAL is selected, and is passed to the
     // format's read() for every record.
813  sequence_file_input_options<typename traits_type::sequence_legal_alphabet,
814  selected_field_ids::contains(field::SEQ_QUAL)> options;
815 
816 protected:
818 
821  record_type record_buffer; // the record currently buffered; returned by front()
824  file_as_tuple_type columns_buffer; // per-field columns, filled by read_columns()
826 
828  std::string file_name; // NOTE(review): not assigned by any constructor visible in this listing
829 
831  stream_type stream; // the underlying input stream
832 
834  bool at_end{false}; // set once read_next_record() finds the stream at EOF
835 
     // std::variant over the types in valid_formats; holds the format selected at construction
837  using format_type = detail::transfer_template_args_onto_t<valid_formats, std::variant>;
839  format_type format;
840 
842  void read_next_record()
843  {
     // Replaces record_buffer with the next record from the stream, using the selected format.
     // Does nothing once at_end is set; sets at_end (leaving the cleared buffer) when the stream
     // already reports EOF before the read.
844  if (at_end)
845  return;
846 
847  // clear the record
848  record_buffer.clear();
849 
850  // at end if we could not read further
851  if (stream.eof())
852  {
853  at_end = true;
854  return;
855  }
856 
857  assert(!format.valueless_by_exception());
     // dispatch to whichever format type the variant currently holds
858  std::visit([&] (sequence_file_input_format_concept & f)
859  {
860  // read new record
     // arguments after options are, in order, the sequence, id and quality targets;
     // get_or_ignore presumably yields a placeholder for fields that were not selected - TODO confirm
861  if constexpr (selected_field_ids::contains(field::SEQ_QUAL))
862  {
     // combined mode: the same SEQ_QUAL buffer is passed as both sequence and quality target
863  f.read(stream,
864  options,
865  detail::get_or_ignore<field::SEQ_QUAL>(record_buffer),
866  detail::get_or_ignore<field::ID>(record_buffer),
867  detail::get_or_ignore<field::SEQ_QUAL>(record_buffer));
868  }
869  else
870  {
871  f.read(stream,
872  options,
873  detail::get_or_ignore<field::SEQ>(record_buffer),
874  detail::get_or_ignore<field::ID>(record_buffer),
875  detail::get_or_ignore<field::QUAL>(record_buffer));
876  }
877  }, format);
878  }
879 
881  void read_columns()
882  {
     // Iterates over this file from its current position and appends each record's selected
     // fields to the matching column in columns_buffer. Fields are moved out of the record.
883  //TODO don't do multiple visits
884  //TODO create specialised version for concatenated_sequences where we append on the concat
     // one reference per possible field; presumably a placeholder for fields that were not
     // selected (those references are never used below) - TODO confirm get_or_ignore semantics
885  auto & sequence_column_buffer = detail::get_or_ignore<field::SEQ>(columns_buffer);
886  auto & id_column_buffer = detail::get_or_ignore<field::ID>(columns_buffer);
887  auto & qual_column_buffer = detail::get_or_ignore<field::QUAL>(columns_buffer);
888  auto & seq_qual_column_buffer = detail::get_or_ignore<field::SEQ_QUAL>(columns_buffer);
889 
890  // read the remaining records and split into column buffers
891  for (auto & rec : *this)
892  {
     // each branch is compiled only if the corresponding field was selected
893  if constexpr (selected_field_ids::contains(field::SEQ))
894  sequence_column_buffer.push_back(std::move(seqan3::get<field::SEQ>(rec)));
895  if constexpr (selected_field_ids::contains(field::ID))
896  id_column_buffer.push_back(std::move(seqan3::get<field::ID>(rec)));
897  if constexpr (selected_field_ids::contains(field::QUAL))
898  qual_column_buffer.push_back(std::move(seqan3::get<field::QUAL>(rec)));
899  if constexpr (selected_field_ids::contains(field::SEQ_QUAL))
900  seq_qual_column_buffer.push_back(std::move(seqan3::get<field::SEQ_QUAL>(rec)));
901  }
902  }
903 
905  friend iterator;
906 };
907 
912 template <istream_concept<char> stream_type,
913  sequence_file_input_format_concept file_format,
914  detail::fields_concept selected_field_ids>
915 sequence_file_input(stream_type && _stream, file_format const &, selected_field_ids const &)
917  selected_field_ids,
919  std::remove_reference_t<stream_type>>;
921 
922 } // namespace seqan3
923 
924 // ------------------------------------------------------------------
925 // std-overloads for the tuple-like interface
926 // ------------------------------------------------------------------
927 
928 namespace std
929 {
932  seqan3::detail::fields_concept selected_field_ids,
933  seqan3::detail::type_list_of_sequence_file_input_formats_concept valid_formats,
935 struct tuple_size<seqan3::sequence_file_input<traits_type, selected_field_ids, valid_formats, stream_type>>
936 {
938  static constexpr size_t value = selected_field_ids::as_array.size();
939 };
940 
942 template <size_t elem_no,
944  seqan3::detail::fields_concept selected_field_ids,
945  seqan3::detail::type_list_of_sequence_file_input_formats_concept valid_formats,
946  seqan3::istream_concept<char> stream_type>
947 struct tuple_element<elem_no, seqan3::sequence_file_input<traits_type, selected_field_ids, valid_formats, stream_type>>
948  : tuple_element<elem_no, typename seqan3::sequence_file_input<traits_type,
949  selected_field_ids,
950  valid_formats,
951  stream_type>::file_as_tuple_type>
952 {};
953 
954 } // namespace std
Contains quality alphabet compositions.
stream_type_ stream_type
The type of the underlying stream.
Definition: input.hpp:485
The FastQ format.
Definition: format_fastq.hpp:106
typename traits_type::template id_container_container< sequence_quality_type > sequence_quality_column_type
Column type of field::SEQ_QUAL (seqan3::concatenated_sequences<sequence_quality_type> by default)...
Definition: input.hpp:550
Provides seqan3::sequence_file_input_format_concept and auxiliary classes.
Thrown if there is no format that accepts a given file extension.
Definition: exception.hpp:54
iterator begin() noexcept
Returns an iterator to current position in the file.
Definition: input.hpp:689
The "sequence", usually a range of nucleotides or amino acids.
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:481
Provides exceptions used in the I/O module.
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:483
typename traits_type::template id_container_container< id_type > id_column_type
Column type of field::ID (seqan3::concatenated_sequences<id_type> by default).
Definition: input.hpp:546
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:707
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:574
Provides the seqan3::sequence_file_format_fastq class.
The generic concept for sequence file input formats.
typename traits_type::template sequence_container_container< sequence_type > sequence_column_type
Column type of field::SEQ (seqan3::concatenated_sequences<sequence_type> by default).
Definition: input.hpp:544
The default traits for seqan3::sequence_file_input.
Definition: input.hpp:197
Contains seqan3::aa27, container aliases and string literals.
SeqAn specific customisations in the standard namespace.
Definition: align_result.hpp:221
sequence_file_input(stream_type &&_stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:659
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::ID (std::string by default).
Definition: input.hpp:521
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:73
::ranges::size size
Alias for ranges::size. Obtains the size of a range whose size can be calculated in constant time...
Definition: ranges:195
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:58
Concept for input streams.
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:748
The qualities, usually in phred-score notation.
void read(stream_type &stream, seqan3::sequence_file_input_options const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
typename traits_type::template quality_container_container< quality_type > quality_column_type
Column type of field::QUAL (seqan3::concatenated_sequences<quality_type> by default).
Definition: input.hpp:548
The requirements a traits_type for seqan3::sequence_file_input must meet.
Joins an arbitrary alphabet with a quality alphabet.
Definition: qualified.hpp:85
Provides seqan3::concatenated_sequences.
std::make_signed_t< size_t > difference_type
A signed integer type, usually std::ptrdiff_t.
Definition: input.hpp:576
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:67
detail::in_file_iterator< sequence_file_input > iterator
The iterator type of this view (an input iterator).
Definition: input.hpp:578
Thrown if there is an unspecified filesystem or stream error while opening, e.g. permission problem...
Definition: exception.hpp:62
A class template that holds a choice of seqan3::field.
Definition: record.hpp:136
Container that stores sequences concatenated internally.
Definition: concatenated_sequences.hpp:117
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:73
The options type defines various option members that influence the behaviour of all or some formats...
Definition: input_options.hpp:52
std::ranges::default_sentinel sentinel
The type returned by end().
Definition: input.hpp:582
Sequence and qualities combined in one range.
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: input.hpp:479
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:572
Provides alphabet adaptations for standard char types.
A traits type that specifies input as amino acids.
Definition: input.hpp:226
type_list< sequence_type, id_type, quality_type, sequence_quality_type > field_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:531
Provides the seqan3::record template and the seqan3::field enum.
The identifier, usually a string.
std::remove_cv_t< std::remove_reference_t< t > > remove_cvref_t
Return the input type with const, volatile and references removed [Type metafunction].
Definition: basic.hpp:64
The FastA format.
Definition: format_fasta.hpp:109
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:580
sequence_file_input(filesystem::path const &_file_name, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:610
sequence_file_input_options< typename traits_type::sequence_legal_alphabet, selected_field_ids::contains(field::SEQ_QUAL)> options
The options are public and its members can be set directly.
Definition: input.hpp:814
Stream concepts.
Contains seqan3::phred42 quality scores.
This header includes C++17 filesystem support and imports it into namespace seqan3::filesystem (indep...
::ranges::equal equal
Alias for ranges::equal. Determines if two sets of elements are the same.
Definition: ranges:210
Provides various metafunctions on generic types.
Meta-header for the nucleotide submodule; includes all headers from alphabet/nucleotide/.
meta::list< types... > type_list
Type that contains multiple types, an alias for meta::list.
Definition: type_list.hpp:54
field
An enumerator for the fields used in file formats. Some of the fields are shared between formats...
Definition: record.hpp:63
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: input.hpp:471
type_list< sequence_column_type, id_column_type, quality_column_type, sequence_quality_column_type > field_column_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:555
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::QUAL (std::vector <seqan3::phred42> by default).
Definition: input.hpp:524
Quality type for traditional Sanger and modern Illumina Phred scores (typical range).
Definition: phred42.hpp:69
typename traits_type::template sequence_container< qualified< typename traits_type::sequence_alphabet, typename traits_type::quality_alphabet > > sequence_quality_type
The type of field::SEQ_QUAL (std::vector <seqan3::dna5q> by default).
Definition: input.hpp:528
Provides the seqan3::detail::in_file_iterator class template.
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::SEQ (std::vector <seqan3::dna5> by default).
Definition: input.hpp:518
Provides the seqan3::sequence_file_format_fasta class.
::ranges::default_sentinel default_sentinel
Alias for ranges::default_sentinel. Empty sentinel type for use with iterator types that know the bou...
Definition: ranges:215