diff --git a/.gitignore b/.gitignore index 4fffb2f..6b5ff3f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /target /Cargo.lock +.venv +__pycache__ diff --git a/Cargo.toml b/Cargo.toml index b98299d..62a37b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,3 @@ [workspace] -members = ["nafcodec", "nafcodec-py"] +members = ["nafcodec", "nafcodec-py", "nafcodec-ffi", "naf_rs"] resolver = "2" diff --git a/data/test-runner.pl b/data/test-runner.pl new file mode 100755 index 0000000..1d61d3b --- /dev/null +++ b/data/test-runner.pl @@ -0,0 +1,89 @@ +#!/usr/bin/env perl +# +# Test runner script +# Copyright (c) 2018-2021 Kirill Kryukov +# See README.md and LICENSE files of this repository +# + +use strict; +use File::Basename qw(basename dirname); +use File::Glob qw(:bsd_glob); + +my @tests = @ARGV; +if (!scalar @tests) { die "Tests to run are not specified\n"; } + +my $null = ($^O =~ /MSWin/) ? 'nul' : '/dev/null'; +my ($n_tests_total, $n_tests_passed) = (0, 0); + +foreach my $test (@tests) +{ + if (-e $test and -d $test) { run_test_set($test); } + else { run_test($test); } +} + +print "$n_tests_passed out of $n_tests_total tests passed\n"; +if ($n_tests_passed != $n_tests_total) { exit 1; } + + +sub run_test_set +{ + my ($dir) = @_; + print "===== $dir =====\n"; + if (!-e $dir or !-d $dir) { die "Can't find test set directory \"$dir\"\n"; } + foreach my $file (bsd_glob("$dir/*.test")) { run_test($file); } +} + +sub run_test +{ + my ($test_file) = @_; + my ($dir, $name) = (dirname($test_file), basename($test_file)); + $name =~ s/\.test$//; + my $test_prefix = "$dir/$name"; + my $group = $name; + $group =~ s/-.*$//; + my $group_prefix = "$dir/$group"; + + print "[$dir/$name] "; + $n_tests_total++; + + open(my $TEST, '<', $test_file) or die "Can't open \"$test_file\"\n"; + binmode $TEST; + my @cmds; + while (<$TEST>) + { + s/[\x0D\x0A]+$//; + my $cmd = $_; + $cmd =~ s/ennaf/..\/ennaf\/ennaf --binary-stderr/g; + $cmd =~ s/unnaf/..\/unnaf\/unnaf --binary-stderr --binary-stdout/g; + $cmd =~ s/\{TEST\}/$test_prefix/g; + $cmd =~ s/\{GROUP\}/$group_prefix/g; + push @cmds, $cmd; + system($cmd); + } + close $TEST; + + my @errors; + foreach my $ref_file (bsd_glob("$dir/$name.*-ref")) + { + my $out_file = $ref_file; + $out_file =~ s/-ref$//; + if (-e $out_file) + { + my $cmperr = system("diff -q $ref_file $out_file >$null"); + if ($cmperr != 0) { push @errors, "\"" . basename($out_file) . "\" differs from \"" . basename($ref_file) . "\""; } + } + else { push @errors, "Can't find output file \"$out_file\""; } + } + + if (scalar(@errors) == 0) + { + print "OK\n"; + $n_tests_passed++; + } + else + { + print "FAIL\n"; + print "Commands:\n", join("\n", map { " $_" } @cmds), "\n"; + print "Errors:\n", join("\n", map { " $_" } @errors), "\n"; + } +} diff --git a/naf_rs/Cargo.toml b/naf_rs/Cargo.toml new file mode 100644 index 0000000..ca159c5 --- /dev/null +++ b/naf_rs/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "naf_rs" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bio = "2.0.1" +clap = { version = "4.5.9", features = ["derive", "env"] } +nafcodec = { version = "0.2.0", path = "../nafcodec" } +tempfile = "3.10.1" diff --git a/naf_rs/LuxC.naf b/naf_rs/LuxC.naf new file mode 100644 index 0000000..6acbfa8 Binary files /dev/null and b/naf_rs/LuxC.naf differ diff --git a/naf_rs/src/ennaf.rs b/naf_rs/src/ennaf.rs new file mode 100644 index 0000000..0c95821 --- /dev/null +++ b/naf_rs/src/ennaf.rs @@ -0,0 +1,79 @@ +use nafcodec::{EncoderBuilder,Memory,Flag,Record}; +use std::borrow::Cow; +use std::fs::File; +use std::io::BufWriter; +use std::str; +use bio::io::{fasta,fastq}; +use crate::EnnafArgs; +use crate::FileFormat; +use crate::SequenceTypeArg; + +pub fn encode_file(args: &EnnafArgs) { + let all_flags = args.format.get_flags_for_format() | Flag::Length | + { if !args.no_mask && (args.sequence == SequenceTypeArg::Dna || args.sequence == SequenceTypeArg::Rna) + {Flag::Mask} + else + {Flag::Length} + } | + { if let Some(_) = args.title + {Flag::Title} + else + {Flag::Length} + }; + let mut encoder = EncoderBuilder::from_flags(args.sequence.into_codec_type(), all_flags).with_storage(Memory).unwrap(); + if let Some(title) = &args.title { + encoder.push_title(title); + } + match args.format { + FileFormat::Fasta => { + let fasta_file = File::open(args.filename.to_owned()).unwrap(); + let fasta_reader = fasta::Reader::new(fasta_file); + for results in fasta_reader.records() { + let record = results.unwrap(); + let r = Record{ + id:Some(Cow::from(record.id())), + comment:{if let Some(desc) = record.desc() { + Some(Cow::from(desc)) + } else { + None + }}, + sequence:Some(Cow::from(record.seq().to_vec())), + length: Some(record.seq().len() as u64), + quality: None + }; + println!{"Parsed record {:?}",r}; + if let Err(e) = encoder.push(&r) { + panic!{"Could not push record {:?} to encoder {}",r,e}; + } + } + }, + FileFormat::Fastq => { + let fastq_file = File::open(args.filename.to_owned()).unwrap(); + let fastq_reader = fastq::Reader::new(fastq_file); + for results in fastq_reader.records() { + let record = results.unwrap(); + let r = Record{ + id:Some(Cow::from(record.id())), + comment:{if let Some(desc) = record.desc() { + Some(Cow::from(desc)) + } else { + None + }}, + sequence:Some(Cow::from(record.seq())), + length: Some(record.seq().len() as u64), + quality: Some(Cow::from(str::from_utf8(record.qual()).unwrap())) + }; + if let Err(e) = encoder.push(&r) { + panic!{"Could not push record {:?} to encoder {}",r,e}; + } + } + } + } + match &args.output { + Some(outfile) => encoder.write(BufWriter::new(File::create(outfile.to_owned()).unwrap())), + + None => encoder.write(BufWriter::new(std::io::stdout())) + }; +} + + diff --git a/naf_rs/src/main.rs b/naf_rs/src/main.rs new file mode 100644 index 0000000..ee68816 --- /dev/null +++ b/naf_rs/src/main.rs @@ -0,0 +1,174 @@ +use clap::{Parser, Subcommand, ValueEnum}; +use nafcodec::{SequenceType,Flag,Flags}; +use std::fmt; + +mod ennaf; +mod unnaf; + +#[derive(Debug,Subcommand)] +enum Process { + ENNAF ( EnnafArgs ), + UNNAF ( UnnafArgs ) +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] +enum FileFormat { + Fasta, + Fastq +} + +impl fmt::Display for FileFormat { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result{ + match self { + FileFormat::Fasta => write!(f, "FASTA"), + FileFormat::Fastq => write!(f, "FASTQ") + } + } +} + +impl FileFormat { + fn get_flags_for_format(&self) -> Flags { + match self{ + FileFormat::Fasta => Flag::Sequence | Flag::Id | Flag::Comment, + FileFormat::Fastq => Flag::Sequence | Flag::Id | Flag::Quality | Flag::Comment, + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] +enum SequenceTypeArg { + Dna, + Rna, + Protein, + Text +} + +impl fmt::Display for SequenceTypeArg { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result{ + match self { + SequenceTypeArg::Dna => write!(f, "DNA"), + SequenceTypeArg::Rna => write!(f, "RNA"), + SequenceTypeArg::Protein => write!(f, "protein"), + SequenceTypeArg::Text => write!(f, "text") + } + } +} + +impl SequenceTypeArg { + fn into_codec_type(&self) -> SequenceType { + match self { + SequenceTypeArg::Dna => SequenceType::Dna, + SequenceTypeArg::Rna => SequenceType::Rna, + SequenceTypeArg::Text => SequenceType::Text, + SequenceTypeArg::Protein => SequenceType::Protein + } + } +} + +#[derive(Parser,Debug)] +#[command(name="nafrs",version = "0.4", about = "Encode/decode NAF files", long_about = None)] +struct Args { + #[command(subcommand)] + process: Process +} + +#[derive(Debug,Parser)] +struct EnnafArgs { + #[arg(short,long,value_name="FILE",help="Write compressed output to FILE -- write to STDOUT if not specified")] + output: Option, + #[arg(short='#',long,value_name="N",default_value_t=1,help="Use compression level N (from -131072 to 22)")] + level: u16, + #[arg(long,value_name="N",default_value_t=11,help="Use window size 2^N for sequence stream (from 10 to 31)")] + long: u8, + #[arg(long,value_name="DIR",env="TMP",help="Use DIR as temporary directory, (overrides TMP environment variable)")] + temp_dir: Option, + #[arg(long,value_name="NAME",help="Use NAME as prefix for temporary files")] + name: Option, + #[arg(long,value_name="TITLE",help="Use TITLE as dataset title")] + title: Option, + #[arg(short,long,value_name="FORMAT",help="Input file type",default_value_t=FileFormat::Fasta)] + format: FileFormat, + #[arg(short,long,value_name="SEQTYPE",help="Input sequence type",default_value_t=SequenceTypeArg::Dna)] + sequence: SequenceTypeArg, + #[arg(long,default_value_t=false,help="Fail on unexpected input characters")] + strict: bool, + #[arg(long,value_name="N",default_value_t=80,help="Override line length to N")] + line_length: u16, + #[arg(long,default_value_t=false,help="Verbose mode")] + verbose: bool, + #[arg(long,default_value_t=false,help="Keep temporary files")] + keep_temp_files: bool, + #[arg(long,default_value_t=false,help="Don't store mask")] + no_mask: bool, + filename: String +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)] +enum UnnafOutput { + Format, // file format & version + PartList, // list of parts + Sizes, // size of parts (original, compressed Band ratio) + Number, // number of records + Title, // frame title + Ids, // record IDs (first part of FASTA name) + Names, // full FASTA names + Lengths, // sequence lengths + TotalLength, // sum of sequence lengths + Mask, // mask regions + FourBit, // four-bit encoded sequences + Seq, // all sequences concatenated (no newlines) + Sequences, // sequences separated by newlines + Fasta, // FASTA file output + Fastq // FASTQ file output +} + +impl fmt::Display for UnnafOutput { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result{ + match self { + UnnafOutput::Format => write!(f,"Format"), + UnnafOutput::PartList => write!(f,"Part List"), + UnnafOutput::Sizes => write!(f,"Sizes"), + UnnafOutput::Number => write!(f,"Number"), + UnnafOutput::Title => write!(f,"Title"), + UnnafOutput::Ids => write!(f,"IDs"), + UnnafOutput::Names => write!(f,"Names"), + UnnafOutput::Lengths => write!(f,"Lengths"), + UnnafOutput::TotalLength => write!(f,"Total Lengths"), + UnnafOutput::Mask => write!(f,"Mask"), + UnnafOutput::FourBit => write!(f,"Four Bit"), + UnnafOutput::Seq => write!(f,"Seq"), + UnnafOutput::Sequences => write!(f,"Sequences"), + UnnafOutput::Fasta => write!(f,"FASTA"), + UnnafOutput::Fastq => write!(f,"FASTQ") + } + } +} + +#[derive(Debug,Parser)] +struct UnnafArgs { + #[arg(short,long,value_name="FILE",help="Write uncompressed output to FILE -- read to STDOUT if not specified")] + output: Option, + #[arg(short='t',long,default_value_t=UnnafOutput::Fasta)] + output_type: UnnafOutput, + #[arg(long,value_name="N",default_value_t=80,help="Override line length to N")] + line_length: u16, + #[arg(long,default_value_t=false,help="Ignore Mask")] + no_mask: bool, + #[arg(long,default_value_t=false,help="Set STDOUT stream to binary mode")] + binary_stdout: bool, + #[arg(long,default_value_t=false,help="Set STDERR stream to binary mode")] + binary_stderr: bool, + filename: String +} + +fn main() { + let args=Args::parse(); + match args.process { + Process::ENNAF (ennaf_args) => { + ennaf::encode_file(&ennaf_args); + } + Process::UNNAF (unnaf_args) => { + unnaf::decode_naf(&unnaf_args); + } + }; +} diff --git a/naf_rs/src/unnaf.rs b/naf_rs/src/unnaf.rs new file mode 100644 index 0000000..c42bddf --- /dev/null +++ b/naf_rs/src/unnaf.rs @@ -0,0 +1,108 @@ +use nafcodec::{DecoderBuilder,Flag,Flags}; + +use std::path::Path; +use bio::io::{fasta,fastq}; +use crate::UnnafArgs; + +pub fn decode_naf(args: &UnnafArgs) { + let flags = match args.output_type { + crate::UnnafOutput::Format => Flags::new(), + crate::UnnafOutput::PartList => Flags::new(), + crate::UnnafOutput::Sizes => Flags::new(), + crate::UnnafOutput::Number => Flags::from(Flag::Sequence), + crate::UnnafOutput::Lengths => Flags::from(Flag::Length), + crate::UnnafOutput::TotalLength => Flags::from(Flag::Length), + crate::UnnafOutput::Title => Flags::from(Flag::Title), + crate::UnnafOutput::Ids => Flags::from(Flag::Id), + crate::UnnafOutput::Names => Flags::from(Flag::Comment), + crate::UnnafOutput::Mask => Flags::from(Flag::Mask), + crate::UnnafOutput::FourBit | + crate::UnnafOutput::Seq | + crate::UnnafOutput::Sequences => Flags::from(Flag::Sequence), + crate::UnnafOutput::Fasta => Flag::Comment | Flag::Sequence | {if args.no_mask {Flag::Mask} else {Flag::Sequence}}, + crate::UnnafOutput::Fastq => Flag::Comment | Flag::Sequence | Flag::Quality | {if args.no_mask {Flag::Mask} else {Flag::Sequence}}, + }; + let filepath = Path::new(&args.filename); + let mut decoder = DecoderBuilder::from_flags(flags).with_path(filepath).unwrap(); + // FIXME: Write to args.output instead of using println! + match args.output_type{ + crate::UnnafOutput::Format => { + let header = decoder.header(); + println!("{:?} sequences in NAF format {:?}",header.sequence_type(),header.format_version()); + }, + crate::UnnafOutput::PartList => println!("{:?}",decoder.header().flags()), + crate::UnnafOutput::Sizes => { + let all_flags = decoder.header().flags(); + let sizes_decoder = DecoderBuilder::from_flags(all_flags).sizes_from_path(filepath).unwrap(); + for size in sizes_decoder { + println!("{}",size); + } + }, + crate::UnnafOutput::Lengths => println!("{:?}",decoder.lengths()), + crate::UnnafOutput::TotalLength => println!("{:?}",decoder.lengths().iter().sum::()), + crate::UnnafOutput::Number => println!("{:?}",decoder.lengths().len()), + crate::UnnafOutput::Title => println!("{}",decoder.title().unwrap()), + crate::UnnafOutput::Ids => { + for record in decoder { + println!("{}",record.unwrap().id.unwrap()); + } + }, + crate::UnnafOutput::Names => { + for record in decoder { + if let Ok(ok_rec) = record { + println!("{} {}",ok_rec.id.unwrap(),ok_rec.comment.unwrap()); + } + } + }, + crate::UnnafOutput::Mask => todo!(), + crate::UnnafOutput::FourBit => todo!(), + crate::UnnafOutput::Seq => { + for record in decoder { + if let Ok(ok_rec) = record { + print!("{}",std::str::from_utf8(&ok_rec.sequence.unwrap()).unwrap()); + } + } + }, + crate::UnnafOutput::Sequences => { + for record in decoder { + if let Ok(ok_rec) = record { + println!("{}",std::str::from_utf8(&ok_rec.sequence.unwrap()).unwrap()); + } + } + }, + crate::UnnafOutput::Fasta => { + for record in decoder { + if let Ok(ok_rec) = record { + let seq = ok_rec.sequence.unwrap(); + let id = ok_rec.id.unwrap().clone(); + let comment = ok_rec.comment.as_deref().clone(); + let fasta_record = fasta::Record::with_attrs( + &id, + comment, + &seq); + // FIXME: does not wrap lines + print!("{}",fasta_record); + } + + } + }, + crate::UnnafOutput::Fastq => + for record in decoder { + if let Ok(ok_rec) = record { + let seq = ok_rec.sequence.unwrap(); + let id = ok_rec.id.unwrap().clone(); + let comment = ok_rec.comment.as_deref().clone(); + let qual = ok_rec.quality.expect("FASTQ output requested, but input has no qualities"); + let fastq_record = fastq::Record::with_attrs( + &id, + comment, + &seq, + qual.as_bytes()); + // FIXME: does not wrap lines + print!("{}",fastq_record); + } + + } + } + +} diff --git a/nafcodec/Cargo.toml b/nafcodec/Cargo.toml index 78bf7e5..ab40e5f 100644 --- a/nafcodec/Cargo.toml +++ b/nafcodec/Cargo.toml @@ -12,6 +12,8 @@ keywords = ["nucleotide", "archive", "biology", "bioinformatics"] categories = ["science", "parser-implementations", "compression"] [dependencies] +cfg-if = "1.0.0" +log = "0.4.22" nom = "7.1.3" [dependencies.zstd] version = "0.13.1" @@ -24,6 +26,8 @@ optional = true default = ["tempfile"] arc = [] nightly = [] +lut = [] +simd = ["lut"] [[test]] name = "decode" diff --git a/nafcodec/src/data.rs b/nafcodec/src/data.rs index d6fbc40..d4a7e92 100644 --- a/nafcodec/src/data.rs +++ b/nafcodec/src/data.rs @@ -5,6 +5,7 @@ use std::borrow::Cow; use std::ops::BitOr; use std::ops::BitOrAssign; +use std::fmt; /// A single masked unit with associated status decoded from the mask block. #[derive(Debug, Clone, PartialEq)] @@ -32,13 +33,16 @@ pub struct Record<'a> { /// The record comment (description). pub comment: Option>, /// The record sequence. - pub sequence: Option>, + pub sequence: Option>, /// The record quality string. pub quality: Option>, /// The record sequence length. pub length: Option, } + +// TODO: implement FASTA/FASTQ IO for Record + // --- FormatVersion ----------------------------------------------------------- /// The supported format versions inside NAF archives. @@ -55,15 +59,20 @@ pub enum FormatVersion { #[derive(Debug, Clone, Copy, Default, PartialEq)] pub enum SequenceType { #[default] + /// DNA Sequence - ATCG(N-) Dna = 0, + /// RNA Sequence - AUCG(N-) Rna = 1, + /// Protein Sequence - single character amino acid Protein = 2, + /// Test Sequence - arbitrary string Text = 3, } impl SequenceType { /// Check whether the sequence type is a nucleotide type. #[inline] + #[must_use] pub const fn is_nucleotide(&self) -> bool { match self { Self::Dna | Self::Rna => true, @@ -98,6 +107,7 @@ pub enum Flag { impl Flag { /// Get all individual flags. + #[must_use] pub const fn values() -> &'static [Self] { &[ Flag::Quality, @@ -112,6 +122,7 @@ impl Flag { } /// View the flag as a single byte mask. + #[must_use] pub const fn as_byte(&self) -> u8 { *self as u8 } @@ -132,11 +143,13 @@ pub struct Flags(u8); impl Flags { /// Create new `Flags` with all flags unset. + #[must_use] pub const fn new() -> Self { Self(0) } /// Check if the given flag is set. + #[must_use] pub const fn test(&self, flag: Flag) -> bool { (self.0 & flag as u8) != 0 } @@ -152,6 +165,7 @@ impl Flags { } /// View the flags as a single byte. + #[must_use] pub const fn as_byte(&self) -> u8 { self.0 } @@ -206,31 +220,37 @@ pub struct Header { impl Header { /// Get the flags of the archive header. + #[must_use] pub const fn flags(&self) -> Flags { self.flags } /// Get the default line length stored in the archive. + #[must_use] pub const fn line_length(&self) -> u64 { self.line_length } /// Get the name separator used in the archive. + #[must_use] pub const fn name_separator(&self) -> char { self.name_separator } /// Get the number of sequences stored in the archive. + #[must_use] pub const fn number_of_sequences(&self) -> u64 { self.number_of_sequences } /// Get the type of sequences stored in the archive. + #[must_use] pub const fn sequence_type(&self) -> SequenceType { self.sequence_type } /// Get the archive format version. + #[must_use] pub const fn format_version(&self) -> FormatVersion { self.format_version } @@ -248,3 +268,34 @@ impl Default for Header { } } } + +//---- Size --------------------------------------------------------------------- + +#[derive(Debug, Clone)] +pub struct Size { + block: String, + pub original: u64, + pub compressed: u64 +} + +impl Size { + pub fn new(block: String, original: u64, compressed: Option) -> Self { + let compressed_real = if let Some(real_size) = compressed {real_size} else { original }; + Size{block, original, compressed: compressed_real} + } +} + +#[allow(clippy::cast_precision_loss)] +impl fmt::Display for Size { + fn fmt(&self, f:&mut fmt::Formatter) -> fmt::Result { + if self.original == self.compressed { + write!(f, "{}: {}",self.block,self.original) + } else { + write!(f, "{}: {} / {} ({:.3}%)", + self.block, + self.compressed, + self.original, + (self.compressed as f32 * 100.0)/(self.original as f32)) + } + } +} diff --git a/nafcodec/src/decoder/mod.rs b/nafcodec/src/decoder/mod.rs index 5c36948..f0d15fe 100644 --- a/nafcodec/src/decoder/mod.rs +++ b/nafcodec/src/decoder/mod.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::fmt::Debug; use std::fs::File; use std::io::BufRead; use std::io::BufReader; @@ -9,6 +8,7 @@ use std::io::SeekFrom; use std::iter::FusedIterator; use std::path::Path; use std::sync::RwLock; +use std::str; mod ioslice; mod parser; @@ -26,6 +26,7 @@ use crate::data::Header; use crate::data::MaskUnit; use crate::data::Record; use crate::data::SequenceType; +use crate::data::Size; use crate::error::Error; /// The wrapper used to decode Zstandard stream. @@ -49,7 +50,7 @@ type ZstdDecoder<'z, R> = BufReader>>>; /// println!(">{}", record.id.unwrap()); /// } /// ``` -#[derive(Debug, Clone)] +//#[derive(Debug, Clone)] pub struct DecoderBuilder { buffer_size: usize, id: bool, @@ -57,6 +58,7 @@ pub struct DecoderBuilder { sequence: bool, quality: bool, mask: bool, + title: bool, } impl DecoderBuilder { @@ -72,6 +74,7 @@ impl DecoderBuilder { sequence: true, quality: true, mask: true, + title: true } } @@ -97,6 +100,7 @@ impl DecoderBuilder { builder.sequence(flags.test(Flag::Sequence)); builder.mask(flags.test(Flag::Mask)); builder.comment(flags.test(Flag::Comment)); + builder.title(flags.test(Flag::Title)); builder } @@ -147,6 +151,13 @@ impl DecoderBuilder { self } + /// Whether or not Title is in NAF frame + #[inline] + pub fn title(&mut self, title: bool) -> &mut Self { + self.title = title; + self + } + /// Consume the builder to get a decoder reading data from the given buffer. pub fn with_bytes<'data, 'z>( &self, @@ -165,17 +176,85 @@ impl DecoderBuilder { .and_then(|f| self.with_reader(std::io::BufReader::new(f))) } + /// Consume the builder and get the sizes from the archive from a given path + pub fn sizes_from_path>( + &self, + path: P + ) -> Result,Error> { + File::open(path.as_ref()) + .map_err(Error::from) + .and_then(|f| self.sizes_from_reader(std::io::BufReader::new(f))) + } + + /// Consume the builder and get the sizes from the archive + pub fn sizes_from_reader(&self, mut reader: R) -> Result,Error> { + let buffer = reader.fill_buf()?; + let naf_header = match self::parser::header(buffer) { + Ok((i, naf_header)) => { + let consumed = buffer.len() - i.len(); + reader.consume(consumed); + naf_header + } + Err(nom::Err::Incomplete(_)) => { + return Err(Error::from(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "failed to read header", + ))); + } + Err(nom::Err::Error(e) | nom::Err::Failure(e)) => { + return Err(Error::from(e)); + } + }; + macro_rules! extract_sizes { + ($flags:expr, $flag:ident, $use_block:expr, $rc:ident, $size:ident, $name:expr) => { + let $size; + if $flags.test(Flag::$flag) { + let mut handle = $rc.write().unwrap(); + // decode the block size + let buf = handle.fill_buf()?; + let (i, original_size) = self::parser::variable_u64(buf)?; + let (i2,compressed_size) = self::parser::variable_u64(i)?; + let block_size; + $size = if Flag::$flag == Flag::Title { + block_size = (buf.len()-i.len()+original_size as usize); + Some(Size::new($name,original_size,None)) + } else { + block_size = (buf.len()-i2.len()+compressed_size as usize); + Some(Size::new($name,original_size,Some(compressed_size))) + }; + handle.consume(block_size); + } else { + $size = None; + } + }; + } + let rc = Rc::new(RwLock::new(reader)); + let flags = naf_header.flags(); + extract_sizes!(flags, Title, self.title, rc, title_size,"Title".to_owned()); + extract_sizes!(flags, Id, self.id, rc, ids_size,"IDs".to_owned()); + extract_sizes!(flags, Comment, self.comment, rc, com_size,"Name".to_owned()); + extract_sizes!(flags, Length, true, rc, len_size,"Length".to_owned()); + extract_sizes!(flags, Mask, self.mask, rc, mask_size,"Mask".to_owned()); + extract_sizes!(flags, Sequence, self.sequence, rc, seq_size, "Sequence".to_owned()); + extract_sizes!(flags, Quality, self.quality, rc, quality_size, "Quality".to_owned()); + let retvec: Vec> = vec![title_size,ids_size,com_size,len_size,mask_size,seq_size,quality_size]; + Ok(retvec + .iter() + .filter_map(|x| x.clone()) + .collect()) + } + /// Consume the builder to get a decoder reading data from `reader`. pub fn with_reader<'z, R: BufRead + Seek>( &self, mut reader: R, ) -> Result, Error> { let buffer = reader.fill_buf()?; - let header = match self::parser::header(buffer) { - Ok((i, header)) => { + let naf_header = match self::parser::header(buffer) { + Ok((i, naf_header)) => { let consumed = buffer.len() - i.len(); reader.consume(consumed); - header + naf_header } Err(nom::Err::Incomplete(_)) => { return Err(Error::from(std::io::Error::new( @@ -188,12 +267,16 @@ impl DecoderBuilder { } }; - if header.flags().test(Flag::Title) { + let title = if naf_header.flags().test(Flag::Title) { let buf = reader.fill_buf()?; let (i, _title) = self::parser::title(buf)?; let consumed = buf.len() - i.len(); + let ret = Some(_title.to_owned()); reader.consume(consumed); - } + ret + } else { + None + }; let rc = Rc::new(RwLock::new(reader)); macro_rules! setup_block { @@ -232,8 +315,21 @@ impl DecoderBuilder { }; } - let flags = header.flags(); + let flags = naf_header.flags(); let mut seqlen = 0; + /* + let title = if flags.test(Flag::Title) && self.title { + let mut handle = rc.write().unwrap(); + let buf = handle.fill_buf()?; + let (i, original_size) = self::parser::variable_u64(buf)?; + let consumed = buf.len() - i.len(); + println!("{}",original_size); + println!("{:?}",&i[0..original_size as usize]); + let ret = Some(str::from_utf8(&i[consumed..original_size as usize])?.to_owned()); + handle.consume(consumed); + handle.seek(SeekFrom::Current(original_size as i64))?; + ret + } else { None };*/ setup_block!(flags, Id, self.id, rc, ids_block); setup_block!(flags, Comment, self.comment, rc, com_block); setup_block!(flags, Length, true, rc, len_block); @@ -245,11 +341,12 @@ impl DecoderBuilder { ids: ids_block.map(CStringReader::new), com: com_block.map(CStringReader::new), len: len_block.map(LengthReader::new), - seq: seq_block.map(|x| SequenceReader::new(x, header.sequence_type())), + seq: seq_block.map(|x| SequenceReader::new(x, naf_header.sequence_type())), qual: quality_block.map(|x| SequenceReader::new(x, SequenceType::Text)), mask: mask_block.map(|x| MaskReader::new(x, seqlen)), + title, n: 0, - header, + header:naf_header, reader: rc, unit: MaskUnit::Unmasked(0), }) @@ -271,7 +368,7 @@ impl Default for DecoderBuilder { /// /// By default, the decoder will decode all available fields, which may not /// be needed. Use a [`DecoderBuilder`] to configure decoding of individual -/// fields. +/// fields. /// /// # Thread safety /// @@ -291,6 +388,7 @@ pub struct Decoder<'z, R: BufRead + Seek> { seq: Option>>, qual: Option>>, mask: Option>>, + title: Option, n: usize, unit: MaskUnit, } @@ -327,6 +425,17 @@ impl Decoder<'_, R> { &self.header } + /// Get the Title of the archive. + /// + /// The first NAF block is the title (provided the Title flag is set). + pub fn title(&self) -> Result<&str, Error> { + if let Some(title) = &self.title { + Ok(title) + } else { + panic!("No title in NAF Archive") + } + } + /// Get the type of sequence in the archive being decoded. /// /// This method is a shortcut for `self.header().sequence_type()`. @@ -349,6 +458,17 @@ impl Decoder<'_, R> { .expect("lock shouldn't be poisoned") } + /// Return sequence lengths as a Vec + /// + /// Iterates through the lengths block, extracting all values. + pub fn lengths(&mut self) -> Vec { + if let Some(l) = &mut self.len { + l.into_iter().map(|x| x.unwrap()).collect::>()[1..].to_vec() + } else { + vec![] + } + } + /// Attempt to read the next record from the archive. /// /// This function expects that a record is available; use `Decoder::next` @@ -368,7 +488,7 @@ impl Decoder<'_, R> { .map(|com| com.into_string().map(Cow::Owned).expect("TODO")); let length = self.len.as_mut().and_then(|r| r.next()).transpose()?; - let mut sequence: Option> = None; + let mut sequence: Option> = None; let mut quality = None; if let Some(l) = length { sequence = self @@ -376,7 +496,7 @@ impl Decoder<'_, R> { .as_mut() .map(|r| r.next(l)) .transpose()? - .map(Cow::Owned); + .map(|com| Cow::from(com.as_bytes().to_owned())); quality = self .qual .as_mut() @@ -399,7 +519,7 @@ impl Decoder<'_, R> { } /// Attempt to mask some regions of the given sequence. - fn mask_sequence(&mut self, sequence: &mut str) -> Result<(), Error> { + fn mask_sequence(&mut self, sequence: &mut [u8]) -> Result<(), Error> { let mut mask = self.unit.clone(); let mut seq = sequence; diff --git a/nafcodec/src/decoder/parser.rs b/nafcodec/src/decoder/parser.rs index 3faaa42..d8d4b01 100644 --- a/nafcodec/src/decoder/parser.rs +++ b/nafcodec/src/decoder/parser.rs @@ -30,11 +30,11 @@ pub fn variable_u64(i: &[u8]) -> IResult<&[u8], u64> { let mut num = 0; let mut basis = 1; - num += ((last & 0x7F) as u64) * basis; + num += u64::from(last & 0x7F) * basis; basis *= 128; for &limb in limbs.iter().rev() { - if let Some(x) = num.checked_add(((limb & 0x7F) as u64) * basis) { + if let Some(x) = num.checked_add(u64::from(limb & 0x7F) * basis) { num = x; basis *= 128; } else { @@ -134,7 +134,6 @@ pub fn title(i: &[u8]) -> IResult<&[u8], &str> { } mod tests { - #[test] fn header() { const HEADER: [u8; 8] = [0x01, 0xF9, 0xEC, 0x01, 0x3E, 0x20, 0x3C, 0x20]; diff --git a/nafcodec/src/decoder/reader.rs b/nafcodec/src/decoder/reader.rs index e79f572..81304a5 100644 --- a/nafcodec/src/decoder/reader.rs +++ b/nafcodec/src/decoder/reader.rs @@ -1,9 +1,25 @@ use std::ffi::CString; use std::io::BufRead; +use std::io::ErrorKind; +use log::{warn,debug}; use crate::data::MaskUnit; use crate::data::SequenceType; - +#[cfg(all(target_arch="x86_64",feature="simd"))] +use core::arch::x86_64::{ + __cpuid_count, + _mm_storeu_si128, + _mm_set_epi8, + _mm_set1_epi32, + _mm_set1_epi8, + _mm_shuffle_epi8, + _mm_srlv_epi32, + __m128i, + _mm_loadu_si128, + _mm_unpackhi_epi8, + _mm_unpacklo_epi8, + _mm_and_si128, +}; // --- CStringReader ----------------------------------------------------------- #[derive(Debug)] @@ -69,6 +85,7 @@ impl Iterator for LengthReader { // --- SequenceReader ---------------------------------------------------------- + #[derive(Debug)] pub struct SequenceReader { reader: R, @@ -89,8 +106,13 @@ impl SequenceReader { let l = length as usize; if self.ty.is_nucleotide() { let mut sequence = String::with_capacity(l); - if self.cache.is_some() && l > 0 { - sequence.push(self.cache.take().unwrap()); + if l > 0 { + if let Some(_) = self.cache { + match self.cache.take() { + Some(cache_take) => sequence.push(cache_take), + None => return Err(std::io::Error::new(ErrorKind::UnexpectedEof,"Could not find next record in cache")) + } + } } while sequence.len() < l { match self.ty { @@ -125,17 +147,50 @@ impl SequenceReader { ) -> Result<(), std::io::Error> { let buffer = self.reader.fill_buf()?; - let rem = length as usize - sequence.len(); - let n = buffer.len().min(rem / 2); + let rem = length - sequence.len(); + let n = buffer.len().min(rem/2); - for x in buffer.iter().take(n) { - let c1 = Self::decode::(x & 0x0F); - sequence.push(c1); - let c2 = Self::decode::(x >> 4); - sequence.push(c2); + // decode the bulk of the characters + cfg_if::cfg_if!{ + if #[cfg(feature="simd")] { + let mut offset = 0; + for i in 0..(n-(n%16))/16{ + let mut simd_buf: [u8;16] = [0;16]; + for (j,x) in buffer[i*16..(i+1)*16].iter().take(16).enumerate() { + simd_buf[j] = *x; + } + if let Ok(seq_buf) = Self::decode_simd::(simd_buf) { + sequence.push_str(String::from_utf8(seq_buf.iter().map(|x| *x).collect::>()).unwrap().as_str()); + offset += 16; + } else { + warn!("SIMD Decoding failed, using LUT"); + break; + } + } + debug!("{:?} bytes of {:?} read, switching to LUT to parse next {}",offset,rem,buffer[offset..n].len()); + for x in buffer[offset..n].iter().take(n-offset) { + let c = Self::decode_lut::(*x); + sequence.push(c[0]); + sequence.push(c[1]); + } + } else if #[cfg(feature="lut")] { + warn!("Parsing sequence using LUT"); + for x in buffer.iter().take(n) { + let c = Self::decode_lut::(*x); + sequence.push(c[0]); + sequence.push(c[1]); + } + } else { + for x in buffer.iter().take(n) { + let c1 = Self::decode::(x & 0x0F); + sequence.push(c1); + let c2 = Self::decode::(x >> 4); + sequence.push(c2); + } + } } - if n < buffer.len() && sequence.len() == length as usize - 1 { + if n < buffer.len() && sequence.len() == length - 1 { let c1 = Self::decode::(buffer[n] & 0x0F); sequence.push(c1); let c2 = Self::decode::(buffer[n] >> 4); @@ -148,6 +203,170 @@ impl SequenceReader { Ok(()) } + #[cfg(feature="lut")] + #[inline] + fn decode_lut(c: u8) -> [char;2] { + const CHARS_LUT_U: [[char;2];256] = [ + ['-','-'], ['U','-'], ['G','-'], ['K','-'], ['C','-'], ['Y','-'], ['S','-'], + ['B','-'], ['A','-'], ['W','-'], ['R','-'], ['D','-'], ['M','-'], ['H','-'], + ['V','-'], ['N','-'], ['-','U'], ['U','U'], ['G','U'], ['K','U'], ['C','U'], + ['Y','U'], ['S','U'], ['B','U'], ['A','U'], ['W','U'], ['R','U'], ['D','U'], + ['M','U'], ['H','U'], ['V','U'], ['N','U'], ['-','G'], ['U','G'], ['G','G'], + ['K','G'], ['C','G'], ['Y','G'], ['S','G'], ['B','G'], ['A','G'], ['W','G'], + ['R','G'], ['D','G'], ['M','G'], ['H','G'], ['V','G'], ['N','G'], ['-','K'], + ['U','K'], ['G','K'], ['K','K'], ['C','K'], ['Y','K'], ['S','K'], ['B','K'], + ['A','K'], ['W','K'], ['R','K'], ['D','K'], ['M','K'], ['H','K'], ['V','K'], + ['N','K'], ['-','C'], ['U','C'], ['G','C'], ['K','C'], ['C','C'], ['Y','C'], + ['S','C'], ['B','C'], ['A','C'], ['W','C'], ['R','C'], ['D','C'], ['M','C'], + ['H','C'], ['V','C'], ['N','C'], ['-','Y'], ['U','Y'], ['G','Y'], ['K','Y'], + ['C','Y'], ['Y','Y'], ['S','Y'], ['B','Y'], ['A','Y'], ['W','Y'], ['R','Y'], + ['D','Y'], ['M','Y'], ['H','Y'], ['V','Y'], ['N','Y'], ['-','S'], ['U','S'], + ['G','S'], ['K','S'], ['C','S'], ['Y','S'], ['S','S'], ['B','S'], ['A','S'], + ['W','S'], ['R','S'], ['D','S'], ['M','S'], ['H','S'], ['V','S'], ['N','S'], + ['-','B'], ['U','B'], ['G','B'], ['K','B'], ['C','B'], ['Y','B'], ['S','B'], + ['B','B'], ['A','B'], ['W','B'], ['R','B'], ['D','B'], ['M','B'], ['H','B'], + ['V','B'], ['N','B'], ['-','A'], ['U','A'], ['G','A'], ['K','A'], ['C','A'], + ['Y','A'], ['S','A'], ['B','A'], ['A','A'], ['W','A'], ['R','A'], ['D','A'], + ['M','A'], ['H','A'], ['V','A'], ['N','A'], ['-','W'], ['U','W'], ['G','W'], + ['K','W'], ['C','W'], ['Y','W'], ['S','W'], ['B','W'], ['A','W'], ['W','W'], + ['R','W'], ['D','W'], ['M','W'], ['H','W'], ['V','W'], ['N','W'], ['-','R'], + ['U','R'], ['G','R'], ['K','R'], ['C','R'], ['Y','R'], ['S','R'], ['B','R'], + ['A','R'], ['W','R'], ['R','R'], ['D','R'], ['M','R'], ['H','R'], ['V','R'], + ['N','R'], ['-','D'], ['U','D'], ['G','D'], ['K','D'], ['C','D'], ['Y','D'], + ['S','D'], ['B','D'], ['A','D'], ['W','D'], ['R','D'], ['D','D'], ['M','D'], + ['H','D'], ['V','D'], ['N','D'], ['-','M'], ['U','M'], ['G','M'], ['K','M'], + ['C','M'], ['Y','M'], ['S','M'], ['B','M'], ['A','M'], ['W','M'], ['R','M'], + ['D','M'], ['M','M'], ['H','M'], ['V','M'], ['N','M'], ['-','H'], ['U','H'], + ['G','H'], ['K','H'], ['C','H'], ['Y','H'], ['S','H'], ['B','H'], ['A','H'], + ['W','H'], ['R','H'], ['D','H'], ['M','H'], ['H','H'], ['V','H'], ['N','H'], + ['-','V'], ['U','V'], ['G','V'], ['K','V'], ['C','V'], ['Y','V'], ['S','V'], + ['B','V'], ['A','V'], ['W','V'], ['R','V'], ['D','V'], ['M','V'], ['H','V'], + ['V','V'], ['N','V'], ['-','N'], ['U','N'], ['G','N'], ['K','N'], ['C','N'], + ['Y','N'], ['S','N'], ['B','N'], ['A','N'], ['W','N'], ['R','N'], ['D','N'], + ['M','N'], ['H','N'], ['V','N'], ['N','N'] + ]; + const CHARS_LUT_T: [[char;2];256] = [ + ['-','-'], ['T','-'], ['G','-'], ['K','-'], ['C','-'], ['Y','-'], ['S','-'], + ['B','-'], ['A','-'], ['W','-'], ['R','-'], ['D','-'], ['M','-'], ['H','-'], + ['V','-'], ['N','-'], ['-','T'], ['T','T'], ['G','T'], ['K','T'], ['C','T'], + ['Y','T'], ['S','T'], ['B','T'], ['A','T'], ['W','T'], ['R','T'], ['D','T'], + ['M','T'], ['H','T'], ['V','T'], ['N','T'], ['-','G'], ['T','G'], ['G','G'], + ['K','G'], ['C','G'], ['Y','G'], ['S','G'], ['B','G'], ['A','G'], ['W','G'], + ['R','G'], ['D','G'], ['M','G'], ['H','G'], ['V','G'], ['N','G'], ['-','K'], + ['T','K'], ['G','K'], ['K','K'], ['C','K'], ['Y','K'], ['S','K'], ['B','K'], + ['A','K'], ['W','K'], ['R','K'], ['D','K'], ['M','K'], ['H','K'], ['V','K'], + ['N','K'], ['-','C'], ['T','C'], ['G','C'], ['K','C'], ['C','C'], ['Y','C'], + ['S','C'], ['B','C'], ['A','C'], ['W','C'], ['R','C'], ['D','C'], ['M','C'], + ['H','C'], ['V','C'], ['N','C'], ['-','Y'], ['T','Y'], ['G','Y'], ['K','Y'], + ['C','Y'], ['Y','Y'], ['S','Y'], ['B','Y'], ['A','Y'], ['W','Y'], ['R','Y'], + ['D','Y'], ['M','Y'], ['H','Y'], ['V','Y'], ['N','Y'], ['-','S'], ['T','S'], + ['G','S'], ['K','S'], ['C','S'], ['Y','S'], ['S','S'], ['B','S'], ['A','S'], + ['W','S'], ['R','S'], ['D','S'], ['M','S'], ['H','S'], ['V','S'], ['N','S'], + ['-','B'], ['T','B'], ['G','B'], ['K','B'], ['C','B'], ['Y','B'], ['S','B'], + ['B','B'], ['A','B'], ['W','B'], ['R','B'], ['D','B'], ['M','B'], ['H','B'], + ['V','B'], ['N','B'], ['-','A'], ['T','A'], ['G','A'], ['K','A'], ['C','A'], + ['Y','A'], ['S','A'], ['B','A'], ['A','A'], ['W','A'], ['R','A'], ['D','A'], + ['M','A'], ['H','A'], ['V','A'], ['N','A'], ['-','W'], ['T','W'], ['G','W'], + ['K','W'], ['C','W'], ['Y','W'], ['S','W'], ['B','W'], ['A','W'], ['W','W'], + ['R','W'], ['D','W'], ['M','W'], ['H','W'], ['V','W'], ['N','W'], ['-','R'], + ['T','R'], ['G','R'], ['K','R'], ['C','R'], ['Y','R'], ['S','R'], ['B','R'], + ['A','R'], ['W','R'], ['R','R'], ['D','R'], ['M','R'], ['H','R'], ['V','R'], + ['N','R'], ['-','D'], ['T','D'], ['G','D'], ['K','D'], ['C','D'], ['Y','D'], + ['S','D'], ['B','D'], ['A','D'], ['W','D'], ['R','D'], ['D','D'], ['M','D'], + ['H','D'], ['V','D'], ['N','D'], ['-','M'], ['T','M'], ['G','M'], ['K','M'], + ['C','M'], ['Y','M'], ['S','M'], ['B','M'], ['A','M'], ['W','M'], ['R','M'], + ['D','M'], ['M','M'], ['H','M'], ['V','M'], ['N','M'], ['-','H'], ['T','H'], + ['G','H'], ['K','H'], ['C','H'], ['Y','H'], ['S','H'], ['B','H'], ['A','H'], + ['W','H'], ['R','H'], ['D','H'], ['M','H'], ['H','H'], ['V','H'], ['N','H'], + ['-','V'], ['T','V'], ['G','V'], ['K','V'], ['C','V'], ['Y','V'], ['S','V'], + ['B','V'], ['A','V'], ['W','V'], ['R','V'], ['D','V'], ['M','V'], ['H','V'], + ['V','V'], ['N','V'], ['-','N'], ['T','N'], ['G','N'], ['K','N'], ['C','N'], + ['Y','N'], ['S','N'], ['B','N'], ['A','N'], ['W','N'], ['R','N'], ['D','N'], + ['M','N'], ['H','N'], ['V','N'], ['N','N'] + ]; + match T { + 'T' => CHARS_LUT_T[usize::from(c)], + 'U' => CHARS_LUT_U[usize::from(c)], + _ => unreachable!() + } + } + + #[cfg(all(target_arch="x86_64",feature="simd"))] + #[inline] + fn decode_simd_x86(inbuf: *const u8, output_lo: *mut [u8;16], output_hi: *mut [u8;16]) -> Result<(),std::io::Error>{ + unsafe { + // Lookup Vector + let lookup_vec: __m128i = _mm_set_epi8( // SSE2 + 'N' as i8, + 'V' as i8, + 'H' as i8, + 'M' as i8, + 'D' as i8, + 'R' as i8, + 'W' as i8, + 'A' as i8, + 'B' as i8, + 'S' as i8, + 'Y' as i8, + 'C' as i8, + 'K' as i8, + 'G' as i8, + T as i8, + '-' as i8, + ); + let shiftval = _mm_set1_epi32(4); // SSE2 + // 4bit mask for each byte + let lo_byte_vec: __m128i = _mm_set1_epi8(0x0f); // SSE2 + // encoded input buffer + let mut mmvec = _mm_loadu_si128(inbuf.cast()); // SSE2 + // shuffle only uses the bottom 4 bits (AND masked), get the low byte + let lobyte = _mm_shuffle_epi8(lookup_vec, _mm_and_si128(mmvec,lo_byte_vec)); // SSSE3 + // shift 4 bits to the right + mmvec = _mm_srlv_epi32(mmvec, shiftval); // AVX2 + // perform the shuffle again + let hibyte = _mm_shuffle_epi8(lookup_vec, _mm_and_si128(mmvec,lo_byte_vec)); // SSSE3 + // Unpack the low 8 bytes (interleaved) + let outvec2 = _mm_unpacklo_epi8(lobyte,hibyte); // SSE2 + // unpack the high 8 bytes (interleaved) + let outvec1 = _mm_unpackhi_epi8(lobyte,hibyte); // SSE2 + // store the vectors to the output arrays + _mm_storeu_si128(output_hi.cast(),outvec1); // SSE2 + _mm_storeu_si128(output_lo.cast(),outvec2); // SSE2 + } + Ok(()) + } + + #[cfg(feature="simd")] + #[inline] + fn decode_simd(inbuf: [u8;16]) -> Result<[u8;32],std::io::Error> { + #[cfg(not(target_arch="x86_64"))] // Add architectures here if adding + return Err(SomeError); + // guarantee that CPU has SSE2, SSSE3, and AVX2 + #[cfg(target_arch="x86_64")] + { + unsafe { + let feature_cpuid = __cpuid_count(1,0); + if __cpuid_count(7,0).ebx & (1<<5) == 0 || // AVX2 + feature_cpuid.edx & (1<<26) == 0 || // SSE2 + feature_cpuid.ecx & (1<<9) == 0 // SSSE3 + { + // this should result in a warning and failover from the calling function + return Err(std::io::Error::new(std::io::ErrorKind::Unsupported,"Not supported by CPU")); + } + } + let mut output_lo: [u8;16] = [0;16]; + let mut output_hi: [u8;16] = [0;16]; + let inbuf_slice = inbuf.as_ptr(); + Self::decode_simd_x86::(inbuf_slice,&mut output_lo,&mut output_hi)?; // caller fails + // on error + let mut outbuf = [0;32]; + for i in 0..32 { + outbuf[i] = if i<16 { output_lo[i] } else { output_hi[i-16] } + } + Ok(outbuf) + } + } + #[inline] fn decode(c: u8) -> char { match c { @@ -205,7 +424,7 @@ impl Iterator for MaskReader { let mut i = 0; let buf = match self.reader.fill_buf() { Err(e) => return Some(Err(e)), - Ok(buf) if buf.len() == 0 => break, + Ok([]) => break, Ok(buf) => buf, }; while i < buf.len() && buf[i] == 0xFF { diff --git a/nafcodec/src/encoder/mod.rs b/nafcodec/src/encoder/mod.rs index e76b1b0..2423d05 100644 --- a/nafcodec/src/encoder/mod.rs +++ b/nafcodec/src/encoder/mod.rs @@ -1,5 +1,6 @@ use std::io::Error as IoError; use std::io::Write; +use std::io::BufWriter; mod counter; mod storage; @@ -66,6 +67,8 @@ pub struct EncoderBuilder { sequence: bool, quality: bool, comment: bool, + mask: bool, + title: bool, compression_level: i32, } @@ -78,6 +81,8 @@ impl EncoderBuilder { quality: false, comment: false, sequence: false, + mask: false, + title: false, compression_level: 0, } } @@ -101,9 +106,25 @@ impl EncoderBuilder { builder.quality(flags.test(Flag::Quality)); builder.sequence(flags.test(Flag::Sequence)); builder.comment(flags.test(Flag::Comment)); + builder.mask(flags.test(Flag::Mask)); + builder.title(flags.test(Flag::Title)); builder } + /// Whether or not to encode the mask data + #[inline] + pub fn mask(&mut self, mask: bool) -> &mut Self{ + self.mask = mask; + self + } + + /// Whether or not NAF file has a title + #[inline] + pub fn title(&mut self, title: bool) -> &mut Self{ + self.title = title; + self + } + /// Whether or not to encode the identifier of each record. #[inline] pub fn id(&mut self, id: bool) -> &mut Self { @@ -161,9 +182,8 @@ impl EncoderBuilder { /// Consume the builder to get an encoder using the given storage. pub fn with_storage<'z, S: Storage>(&self, storage: S) -> Result, Error> { - let mut header = Header::default(); + let mut header = Header{sequence_type:self.sequence_type, ..Default::default()}; - header.sequence_type = self.sequence_type; if self.sequence_type == SequenceType::Dna { header.format_version = FormatVersion::V1; } else { @@ -184,6 +204,9 @@ impl EncoderBuilder { header.flags.set(Flag::Quality); header.flags.set(Flag::Length); } + if self.title { + header.flags.set(Flag::Title); + } let lens = self.new_buffer(&storage)?; let id = if self.id { @@ -210,6 +233,19 @@ impl EncoderBuilder { None }; + let title = if self.title { + let title_buffer = storage.create_buffer()?; + Some(BufWriter::new(title_buffer)) + } else { + None + }; + + /*let mask = if self.mask { + Some(WriteCounter::new(self.new_buffer(&storage)?)) + } else { + None + };*/ + Ok(Encoder { header, storage, @@ -217,6 +253,8 @@ impl EncoderBuilder { qual, com, id, + title, + mask: self.mask, len: WriteCounter::new(lens), }) } @@ -232,12 +270,13 @@ impl EncoderBuilder { pub struct Encoder<'z, S: Storage> { header: Header, storage: S, + title: Option>, id: Option>>, len: WriteCounter>, com: Option>>, seq: Option>>>, qual: Option>>, - // mask: WriteCounter>, + mask: bool,//Option>>, } impl Encoder<'_, S> { @@ -271,12 +310,12 @@ impl Encoder<'_, S> { if let Some(seq) = record.sequence.as_ref() { let length = seq.len(); write_length(length as u64, &mut self.len)?; - if let Err(e) = seq_writer.write(seq.as_bytes()) { + if self.mask { todo!() } + if let Err(e) = seq_writer.write(seq) { if e.kind() == std::io::ErrorKind::InvalidData { return Err(Error::InvalidSequence); - } else { - return Err(Error::Io(e)); - } + } + return Err(Error::Io(e)); } seq_writer.flush()?; } else { @@ -299,6 +338,21 @@ impl Encoder<'_, S> { Ok(()) } + /// Push a Title to the archive. + /// + /// The title is a string slice which is stored uncompressed + pub fn push_title(&mut self, title: &str) -> Result<(), Error> { + if let Some(title_writer) = self.title.as_mut() { + let length = title.len(); + write_length(length as u64, &mut self.len)?; + title_writer.write_all(title.as_bytes())?; + title_writer.flush()?; + } else { + return Err(Error::MissingField("title")); + } + Ok(()) + } + /// Finalize the archive and write it to the given writer. /// /// This method consumes the [`Encoder`], since it cannot receive any @@ -326,6 +380,15 @@ impl Encoder<'_, S> { write_variable_length(self.header.line_length, &mut file)?; write_variable_length(self.header.number_of_sequences, &mut file)?; + if let Some(title) = self.title { + let title_buffer_result = title.into_inner(); + if let Ok(title_buffer) = title_buffer_result { + let title_length = self.storage.buffer_length(&title_buffer)?; + write_variable_length(title_length, &mut file)?; + self.storage.write_buffer(title_buffer, &mut file)?; + } + } + // -- ids --- macro_rules! write_block { @@ -371,7 +434,7 @@ mod tests { let r1 = Record { id: Some("r1".into()), comment: Some("record 1".into()), - sequence: Some("MYYK".into()), + sequence: Some(b"MYYK".into()), ..Default::default() }; encoder.push(&r1).unwrap(); @@ -379,7 +442,7 @@ mod tests { let r2 = Record { id: Some("r2".into()), comment: Some("record 2".into()), - sequence: Some("MTTE".into()), + sequence: Some(b"MTTE".into()), ..Default::default() }; encoder.push(&r2).unwrap(); @@ -400,7 +463,7 @@ mod tests { let r1 = Record { id: Some("r1".into()), comment: Some("record 1".into()), - sequence: Some("ATTATTGC".into()), + sequence: Some(b"ATTATTGC".into()), ..Default::default() }; encoder.push(&r1).unwrap(); @@ -408,7 +471,7 @@ mod tests { let r2 = Record { id: Some("r2".into()), comment: Some("record 2".into()), - sequence: Some("ATATGVBGD".into()), + sequence: Some(b"ATATGVBGD".into()), ..Default::default() }; encoder.push(&r2).unwrap(); @@ -419,6 +482,7 @@ mod tests { let decoder = crate::Decoder::from_path("/tmp/test2.naf").unwrap(); let records = decoder.collect::>(); + println!("{:?}",records); assert_eq!(records.len(), 2); } } diff --git a/nafcodec/src/encoder/writer.rs b/nafcodec/src/encoder/writer.rs index fd942b7..606d892 100644 --- a/nafcodec/src/encoder/writer.rs +++ b/nafcodec/src/encoder/writer.rs @@ -1,7 +1,14 @@ +use cfg_if::cfg_if; use std::io::Error as IoError; use std::io::Write; use crate::data::SequenceType; +#[cfg(feature="lut")] +const MINIBYTE_LUT: [u8;32] = [ + 0, 8, 7, 4, 11, 0, 0, 2,13, + 0, 0, 3, 0,12,15, 0, 0, + 0,10, 6, 1, 1,14, 9, 0, + 5, 0, 0, 0, 0, 0, 0]; pub struct SequenceWriter { ty: SequenceType, @@ -21,12 +28,24 @@ impl SequenceWriter { pub fn into_inner(mut self) -> Result { // make sure to write the last letter of the last sequence if any. if let Some(letter) = self.cache.take() { - self.writer.write_all(&[self.encode(letter)?])?; + cfg_if!{ + if #[cfg(features="lut")] { + self.writer.write_all(&[self.encode_lut(letter)?])?; + } else { + self.writer.write_all(&[self.encode(letter)?])?; + } + } } self.writer.flush()?; Ok(self.writer) } + #[cfg(feature="lut")] + #[inline] + fn encode_lut(&self, c: u8) -> Result { + Ok(MINIBYTE_LUT[usize::from(c&(30|c>>6))]) + } + #[inline] fn encode(&self, c: u8) -> Result { match c { @@ -69,7 +88,13 @@ impl Write for SequenceWriter { let mut bytes = s; let mut encoded = Vec::with_capacity((s.len() + 1) / 2); if let Some(letter) = self.cache.take() { - let c = (self.encode(s[0])? << 4) | self.encode(letter)?; + cfg_if!{ + if #[cfg(feature="lut")] { + let c = (self.encode_lut(s[0])? << 4) | self.encode(letter)?; + } else { + let c = (self.encode(s[0])? << 4) | self.encode(letter)?; + } + } encoded.push(c); bytes = &s[1..]; } @@ -79,7 +104,13 @@ impl Write for SequenceWriter { assert!(self.cache.is_none()); self.cache = Some(chunk[0]); } else { - let c = (self.encode(chunk[1])? << 4) | self.encode(chunk[0])?; + cfg_if!{ + if #[cfg(feature="lut")] { + let c = (self.encode_lut(chunk[1])? << 4) | self.encode_lut(chunk[0])?; + } else { + let c = (self.encode(chunk[1])? << 4) | self.encode(chunk[0])?; + } + } encoded.push(c); } } diff --git a/nafcodec/src/lib.rs b/nafcodec/src/lib.rs index 9cd906f..6e3c3a7 100644 --- a/nafcodec/src/lib.rs +++ b/nafcodec/src/lib.rs @@ -1,6 +1,12 @@ #![doc = include_str!("../README.md")] #![cfg_attr(feature = "nightly", feature(seek_stream_len))] #![cfg_attr(feature = "nightly", feature(iter_advance_by))] +#![warn( + missing_docs, + clippy::unwrap_used, + clippy::expect_used, + clippy::pedantic +)] mod data; mod decoder; diff --git a/nafcodec/tests/decoder/dna.rs b/nafcodec/tests/decoder/dna.rs index 54d3925..81ec3a0 100644 --- a/nafcodec/tests/decoder/dna.rs +++ b/nafcodec/tests/decoder/dna.rs @@ -1,6 +1,7 @@ use nafcodec::Decoder; use nafcodec::DecoderBuilder; use nafcodec::SequenceType; +use std::str; const GENOME: &[u8] = include_bytes!("../../../data/NZ_AAEN01000029.naf"); const MASKED: &[u8] = include_bytes!("../../../data/masked.naf"); @@ -20,10 +21,10 @@ fn decode() { assert_eq!(r1.comment.unwrap(), "Bacillus anthracis str. CNEVA-9066 map unlocalized plasmid pXO1 cont2250, whole genome shotgun sequence"); let seq = r1.sequence.unwrap(); assert_eq!(seq.len(), 182777); - assert_eq!(seq.chars().filter(|&x| x == 'A').count(), 62115); - assert_eq!(seq.chars().filter(|&x| x == 'C').count(), 28747); - assert_eq!(seq.chars().filter(|&x| x == 'G').count(), 30763); - assert_eq!(seq.chars().filter(|&x| x == 'T').count(), 61152); + assert_eq!(str::from_utf8(&seq).unwrap().chars().filter(|&x| x == 'A').count(), 62115); + assert_eq!(str::from_utf8(&seq).unwrap().chars().filter(|&x| x == 'C').count(), 28747); + assert_eq!(str::from_utf8(&seq).unwrap().chars().filter(|&x| x == 'G').count(), 30763); + assert_eq!(str::from_utf8(&seq).unwrap().chars().filter(|&x| x == 'T').count(), 61152); let r2 = decoder.next().unwrap().unwrap(); assert_eq!(r2.id.unwrap(), "NZ_AAEN01000030.3"); @@ -46,18 +47,18 @@ fn mask() { let r1 = decoder.next().unwrap().unwrap(); assert_eq!(r1.id.unwrap(), "test1"); let seq = r1.sequence.unwrap(); - assert!(seq[..657].chars().all(|x| x.is_uppercase())); - assert!(seq[657..676].chars().all(|x| x.is_lowercase())); - assert!(seq[676..1311].chars().all(|x| x.is_uppercase())); - assert!(seq[1311..1350].chars().all(|x| x.is_lowercase())); + assert!(str::from_utf8(&seq[..657]).unwrap().chars().all(|x| x.is_uppercase())); + assert!(str::from_utf8(&seq[657..676]).unwrap().chars().all(|x| x.is_lowercase())); + assert!(str::from_utf8(&seq[676..1311]).unwrap().chars().all(|x| x.is_uppercase())); + assert!(str::from_utf8(&seq[1311..1350]).unwrap().chars().all(|x| x.is_lowercase())); let r2 = decoder.next().unwrap().unwrap(); assert_eq!(r2.id.unwrap(), "test2"); let seq = r2.sequence.unwrap(); - assert!(seq[..525].chars().all(|x| x.is_uppercase())); - assert!(seq[525..621].chars().all(|x| x.is_lowercase())); - assert!(seq[621..720].chars().all(|x| x.is_uppercase())); - assert!(seq[720..733].chars().all(|x| x.is_lowercase())); + assert!(str::from_utf8(&seq[..525] ).unwrap().chars().all(|x| x.is_uppercase())); + assert!(str::from_utf8(&seq[525..621]).unwrap().chars().all(|x| x.is_lowercase())); + assert!(str::from_utf8(&seq[621..720]).unwrap().chars().all(|x| x.is_uppercase())); + assert!(str::from_utf8(&seq[720..733]).unwrap().chars().all(|x| x.is_lowercase())); assert!(decoder.next().is_none()); } @@ -77,12 +78,12 @@ fn force_nomask() { let r1 = decoder.next().unwrap().unwrap(); assert_eq!(r1.id.unwrap(), "test1"); let seq = r1.sequence.unwrap(); - assert!(seq[..].chars().all(|x| x.is_uppercase())); + assert!(str::from_utf8(&seq[..]).unwrap().chars().all(|x| x.is_uppercase())); let r2 = decoder.next().unwrap().unwrap(); assert_eq!(r2.id.unwrap(), "test2"); let seq = r2.sequence.unwrap(); - assert!(seq[..].chars().all(|x| x.is_uppercase())); + assert!(str::from_utf8(&seq[..]).unwrap().chars().all(|x| x.is_uppercase())); assert!(decoder.next().is_none()); } diff --git a/nafcodec/tests/decoder/fastq.rs b/nafcodec/tests/decoder/fastq.rs index 7234e78..1b17622 100644 --- a/nafcodec/tests/decoder/fastq.rs +++ b/nafcodec/tests/decoder/fastq.rs @@ -3,6 +3,7 @@ use nafcodec::DecoderBuilder; use nafcodec::Flag; use nafcodec::Header; use nafcodec::SequenceType; +use std::str; const ARCHIVE: &[u8] = include_bytes!("../../../data/phix.naf"); @@ -40,7 +41,8 @@ fn decode() { "a comment that should not be included in the SAM output" ); let seq = r1.sequence.unwrap(); - assert!(seq.starts_with("NGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTC")); + let seq_str = str::from_utf8(&seq).unwrap(); + assert_eq!(&seq_str[0..(67-31)],"NGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTC"); let qual = r1.quality.unwrap(); assert!(qual.starts_with("#8CCCGGGGGGGGGGGGGGGGGGGGGGGGGG")); diff --git a/nafcodec/tests/encoder.rs b/nafcodec/tests/encoder.rs index e24920a..0f7c0da 100644 --- a/nafcodec/tests/encoder.rs +++ b/nafcodec/tests/encoder.rs @@ -14,14 +14,14 @@ fn test_records() -> Vec> { Record { id: Some(Cow::from("r1")), comment: Some(Cow::from("record 1")), - sequence: Some(Cow::from("NGCTCTTAAACCTGCTA")), + sequence: Some(Cow::from(b"NGCTCTTAAACCTGCTA")), quality: Some(Cow::from("#8CCCGGGGGGGGGGGG")), length: Some(17), }, Record { id: Some(Cow::from("r2")), comment: Some(Cow::from("record 2")), - sequence: Some(Cow::from("NTAATAAGCAATGACGGCAGC")), + sequence: Some(Cow::from(b"NTAATAAGCAATGACGGCAGC")), quality: Some(Cow::from("#8AACCFF