Skip to content

Rewrote ennaf/unnaf Tool in Rust to test nafcodec library functionality #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
/target
/Cargo.lock
.venv
__pycache__
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[workspace]
members = ["nafcodec", "nafcodec-py"]
members = ["nafcodec", "nafcodec-py", "nafcodec-ffi", "naf_rs"]
resolver = "2"
89 changes: 89 additions & 0 deletions data/test-runner.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env perl
#
# Test runner script
# Copyright (c) 2018-2021 Kirill Kryukov
# See README.md and LICENSE files of this repository
#

use strict;
use File::Basename qw(basename dirname);
use File::Glob qw(:bsd_glob);

# Entry point: each command-line argument is either a directory holding
# *.test files (run as a set) or a single test file.
my @tests = @ARGV;
die "Tests to run are not specified\n" unless scalar @tests;

# Platform-appropriate null device path.
my $null = ($^O =~ /MSWin/) ? 'nul' : '/dev/null';

# File-level counters updated by run_test().
my $n_tests_total = 0;
my $n_tests_passed = 0;

for my $test (@tests)
{
    # -d is false for paths that do not exist, so no separate -e check is needed.
    if (-d $test) { run_test_set($test); }
    else { run_test($test); }
}

print "$n_tests_passed out of $n_tests_total tests passed\n";

# Non-zero exit status when at least one test did not pass.
exit 1 if $n_tests_passed != $n_tests_total;


# Prints a banner for directory $dir and runs every "*.test" file found
# directly inside it. Dies when $dir is not an existing directory.
sub run_test_set
{
    my ($dir) = @_;
    print "===== $dir =====\n";

    unless (-e $dir and -d $dir)
    {
        die "Can't find test set directory \"$dir\"\n";
    }

    my @test_files = bsd_glob("$dir/*.test");
    foreach my $test_file (@test_files)
    {
        run_test($test_file);
    }
}

# Runs one test file.
#
# Every line of $test_file is a shell command. Before it is executed,
# "ennaf"/"unnaf" are rewritten to the relative tool paths (with their
# binary-mode flags) and the {TEST} / {GROUP} placeholders are expanded:
#   {TEST}  -> "<dir>/<name>"  (test file path without ".test")
#   {GROUP} -> "<dir>/<group>" (<name> with everything from the first "-" removed)
#
# Afterwards each reference file "<dir>/<name>.*-ref" is compared with the
# file of the same name minus the "-ref" suffix. The test passes when all
# of those output files exist and are identical to their references.
# Prints "OK" or "FAIL" (with the commands and errors) and updates the
# file-level counters $n_tests_total / $n_tests_passed.
sub run_test
{
    my ($test_file) = @_;

    # Core module, loaded locally so this sub does not rely on the
    # file's top-level "use" lines.
    require File::Compare;

    my ($dir, $name) = (dirname($test_file), basename($test_file));
    $name =~ s/\.test$//;
    my $test_prefix = "$dir/$name";
    my $group = $name;
    $group =~ s/-.*$//;
    my $group_prefix = "$dir/$group";

    print "[$dir/$name] ";
    $n_tests_total++;

    open(my $TEST, '<', $test_file) or die "Can't open \"$test_file\"\n";
    binmode $TEST;
    my @cmds;
    while (<$TEST>)
    {
        # Strip CR/LF so test files with either line-ending style work.
        s/[\x0D\x0A]+$//;
        my $cmd = $_;
        $cmd =~ s/ennaf/..\/ennaf\/ennaf --binary-stderr/g;
        $cmd =~ s/unnaf/..\/unnaf\/unnaf --binary-stderr --binary-stdout/g;
        $cmd =~ s/\{TEST\}/$test_prefix/g;
        $cmd =~ s/\{GROUP\}/$group_prefix/g;
        push @cmds, $cmd;
        # The command's exit status is intentionally not checked here;
        # pass/fail is decided only by the reference comparison below.
        system($cmd);
    }
    close $TEST;

    my @errors;
    foreach my $ref_file (bsd_glob("$dir/$name.*-ref"))
    {
        my $out_file = $ref_file;
        $out_file =~ s/-ref$//;
        if (-e $out_file)
        {
            # Compare in-process instead of interpolating both paths into a
            # "diff" shell command: that broke on paths containing spaces or
            # shell metacharacters and required an external diff binary.
            # compare() returns 0 for identical files, 1 for different
            # files and -1 on error; anything non-zero counts as a mismatch.
            my $cmperr = File::Compare::compare($ref_file, $out_file);
            if ($cmperr != 0) { push @errors, "\"" . basename($out_file) . "\" differs from \"" . basename($ref_file) . "\""; }
        }
        else { push @errors, "Can't find output file \"$out_file\""; }
    }

    if (scalar(@errors) == 0)
    {
        print "OK\n";
        $n_tests_passed++;
    }
    else
    {
        print "FAIL\n";
        print "Commands:\n", join("\n", map { " $_" } @cmds), "\n";
        print "Errors:\n", join("\n", map { " $_" } @errors), "\n";
    }
}
12 changes: 12 additions & 0 deletions naf_rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[package]
name = "naf_rs"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
bio = "2.0.1"
clap = { version = "4.5.9", features = ["derive", "env"] }
nafcodec = { version = "0.2.0", path = "../nafcodec" }
tempfile = "3.10.1"
Binary file added naf_rs/LuxC.naf
Binary file not shown.
79 changes: 79 additions & 0 deletions naf_rs/src/ennaf.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
use nafcodec::{EncoderBuilder,Memory,Flag,Record};
use std::borrow::Cow;
use std::fs::File;
use std::io::BufWriter;
use std::str;
use bio::io::{fasta,fastq};
use crate::EnnafArgs;
use crate::FileFormat;
use crate::SequenceTypeArg;

/// Encodes the FASTA or FASTQ file named by `args.filename` into a NAF
/// archive, written to `args.output` or to standard output when no output
/// path was given.
///
/// The archive always stores the fields of the chosen input format plus
/// sequence lengths. A mask is stored for DNA/RNA input unless `no_mask`
/// is set, and a title is stored when `args.title` is present.
///
/// With `args.verbose`, every parsed record is logged to standard error.
/// Standard output is never used for diagnostics because it may carry the
/// archive itself.
///
/// # Panics
///
/// Panics when the input cannot be opened or parsed, when a FASTQ quality
/// string is not valid UTF-8, when a record is rejected by the encoder,
/// when the output file cannot be created, or when writing the archive
/// fails.
pub fn encode_file(args: &EnnafArgs) {
    // Start from the per-format fields; lengths are always recorded.
    let mut all_flags = args.format.get_flags_for_format() | Flag::Length;
    let is_nucleotide = matches!(
        args.sequence,
        SequenceTypeArg::Dna | SequenceTypeArg::Rna
    );
    if !args.no_mask && is_nucleotide {
        all_flags = all_flags | Flag::Mask;
    }
    if args.title.is_some() {
        all_flags = all_flags | Flag::Title;
    }

    let mut encoder = EncoderBuilder::from_flags(args.sequence.into_codec_type(), all_flags)
        .with_storage(Memory)
        .unwrap();
    if let Some(title) = &args.title {
        encoder.push_title(title);
    }

    // Both formats read from the same path, so open it once up front.
    let input = File::open(&args.filename)
        .unwrap_or_else(|e| panic!("Could not open input file {}: {}", args.filename, e));

    match args.format {
        FileFormat::Fasta => {
            for result in fasta::Reader::new(input).records() {
                let record = result.expect("Could not parse FASTA record");
                let r = Record {
                    id: Some(Cow::Borrowed(record.id())),
                    comment: record.desc().map(Cow::Borrowed),
                    // Borrow the sequence instead of copying it into a Vec.
                    sequence: Some(Cow::Borrowed(record.seq())),
                    length: Some(record.seq().len() as u64),
                    quality: None,
                };
                if args.verbose {
                    // stderr, not stdout: stdout may be the archive destination.
                    eprintln!("Parsed record {:?}", r);
                }
                if let Err(e) = encoder.push(&r) {
                    panic!("Could not push record {:?} to encoder {}", r, e);
                }
            }
        }
        FileFormat::Fastq => {
            for result in fastq::Reader::new(input).records() {
                let record = result.expect("Could not parse FASTQ record");
                let quality = str::from_utf8(record.qual())
                    .expect("FASTQ quality string is not valid UTF-8");
                let r = Record {
                    id: Some(Cow::Borrowed(record.id())),
                    comment: record.desc().map(Cow::Borrowed),
                    sequence: Some(Cow::Borrowed(record.seq())),
                    length: Some(record.seq().len() as u64),
                    quality: Some(Cow::Borrowed(quality)),
                };
                if args.verbose {
                    eprintln!("Parsed record {:?}", r);
                }
                if let Err(e) = encoder.push(&r) {
                    panic!("Could not push record {:?} to encoder {}", r, e);
                }
            }
        }
    }

    // The write result used to be dropped, hiding failed or truncated output.
    let written = match &args.output {
        Some(outfile) => {
            let file = File::create(outfile)
                .unwrap_or_else(|e| panic!("Could not create output file {}: {}", outfile, e));
            encoder.write(BufWriter::new(file))
        }
        None => encoder.write(BufWriter::new(std::io::stdout())),
    };
    if let Err(e) = written {
        panic!("Could not write NAF archive: {}", e);
    }
}


174 changes: 174 additions & 0 deletions naf_rs/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
use clap::{Parser, Subcommand, ValueEnum};
use nafcodec::{SequenceType,Flag,Flags};
use std::fmt;

mod ennaf;
mod unnaf;

// Subcommands of the binary; each variant carries the parsed arguments of
// that subcommand.
// Plain `//` comments are deliberate: clap turns `///` doc comments on
// derived items into help text.
#[derive(Debug, Subcommand)]
enum Process {
    ENNAF(EnnafArgs),
    UNNAF(UnnafArgs),
}

// Input file formats selectable on the command line.
// Plain `//` comments are deliberate: clap turns `///` doc comments on
// `ValueEnum` variants into help text.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
enum FileFormat {
    Fasta,
    Fastq,
}

/// Renders the format as its upper-case name (`FASTA` / `FASTQ`).
impl fmt::Display for FileFormat {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let label = match self {
            FileFormat::Fasta => "FASTA",
            FileFormat::Fastq => "FASTQ",
        };
        f.write_str(label)
    }
}

impl FileFormat {
    /// Returns the set of record fields stored for this input format:
    /// sequence, id and comment for both formats, plus quality for FASTQ.
    fn get_flags_for_format(&self) -> Flags {
        // Fields shared by both formats.
        let shared = Flag::Sequence | Flag::Id;
        match self {
            FileFormat::Fasta => shared | Flag::Comment,
            FileFormat::Fastq => shared | Flag::Quality | Flag::Comment,
        }
    }
}

// Sequence alphabets selectable on the command line.
// Plain `//` comments are deliberate: clap turns `///` doc comments on
// `ValueEnum` variants into help text.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
enum SequenceTypeArg {
    Dna,
    Rna,
    Protein,
    Text,
}

/// Renders the sequence type as `DNA`, `RNA`, `protein` or `text`.
impl fmt::Display for SequenceTypeArg {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let label = match self {
            SequenceTypeArg::Dna => "DNA",
            SequenceTypeArg::Rna => "RNA",
            SequenceTypeArg::Protein => "protein",
            SequenceTypeArg::Text => "text",
        };
        f.write_str(label)
    }
}

impl SequenceTypeArg {
    /// Converts the command-line sequence type into the codec's
    /// `SequenceType`, variant for variant.
    ///
    /// Takes `self` by value: the type is `Copy`, and `into_*` methods
    /// conventionally consume their receiver. Method-call syntax such as
    /// `args.sequence.into_codec_type()` keeps working unchanged.
    fn into_codec_type(self) -> SequenceType {
        match self {
            SequenceTypeArg::Dna => SequenceType::Dna,
            SequenceTypeArg::Rna => SequenceType::Rna,
            SequenceTypeArg::Protein => SequenceType::Protein,
            SequenceTypeArg::Text => SequenceType::Text,
        }
    }
}

// Top-level command line: a single required subcommand.
// Plain `//` comments are deliberate: clap turns `///` doc comments on
// derived items into help text.
#[derive(Parser, Debug)]
#[command(
    name = "nafrs",
    version = "0.4",
    about = "Encode/decode NAF files",
    long_about = None
)]
struct Args {
    #[command(subcommand)]
    process: Process,
}

// Command-line options of the encoding subcommand.
// Plain `//` comments are deliberate: clap turns `///` doc comments on
// derived fields into help text.
#[derive(Debug, Parser)]
struct EnnafArgs {
    #[arg(
        short,
        long,
        value_name = "FILE",
        help = "Write compressed output to FILE -- write to STDOUT if not specified"
    )]
    output: Option<String>,
    // Signed, because the advertised range starts at -131072; the previous
    // `u16` could not represent any negative level. `allow_negative_numbers`
    // lets `-# -5` parse instead of treating `-5` as an unknown flag.
    #[arg(
        short = '#',
        long,
        value_name = "N",
        default_value_t = 1,
        allow_negative_numbers = true,
        value_parser = clap::value_parser!(i32).range(-131072..=22),
        help = "Use compression level N (from -131072 to 22)"
    )]
    level: i32,
    // Enforce the advertised 10..=31 range at parse time.
    #[arg(
        long,
        value_name = "N",
        default_value_t = 11,
        value_parser = clap::value_parser!(u8).range(10..=31),
        help = "Use window size 2^N for sequence stream (from 10 to 31)"
    )]
    long: u8,
    #[arg(
        long,
        value_name = "DIR",
        env = "TMP",
        help = "Use DIR as temporary directory, (overrides TMP environment variable)"
    )]
    temp_dir: Option<String>,
    #[arg(long, value_name = "NAME", help = "Use NAME as prefix for temporary files")]
    name: Option<String>,
    #[arg(long, value_name = "TITLE", help = "Use TITLE as dataset title")]
    title: Option<String>,
    // `value_enum` makes clap derive the default's textual form from the
    // `ValueEnum` value name. Without it the default is rendered through
    // `Display` ("FASTA"), which is not one of the accepted value names.
    #[arg(
        short,
        long,
        value_enum,
        value_name = "FORMAT",
        help = "Input file type",
        default_value_t = FileFormat::Fasta
    )]
    format: FileFormat,
    // Same reasoning as `format`: `Display` yields "DNA", not a value name.
    #[arg(
        short,
        long,
        value_enum,
        value_name = "SEQTYPE",
        help = "Input sequence type",
        default_value_t = SequenceTypeArg::Dna
    )]
    sequence: SequenceTypeArg,
    #[arg(long, default_value_t = false, help = "Fail on unexpected input characters")]
    strict: bool,
    #[arg(long, value_name = "N", default_value_t = 80, help = "Override line length to N")]
    line_length: u16,
    #[arg(long, default_value_t = false, help = "Verbose mode")]
    verbose: bool,
    #[arg(long, default_value_t = false, help = "Keep temporary files")]
    keep_temp_files: bool,
    #[arg(long, default_value_t = false, help = "Don't store mask")]
    no_mask: bool,
    // Positional: path of the input file.
    filename: String,
}

// Kinds of output selectable for the decoding subcommand.
// Plain `//` comments are deliberate: clap turns `///` doc comments on
// `ValueEnum` variants into help text.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
enum UnnafOutput {
    // File format and version.
    Format,
    // List of parts.
    PartList,
    // Size of parts (original, compressed and ratio).
    Sizes,
    // Number of records.
    Number,
    // Frame title.
    Title,
    // Record IDs (first part of the FASTA name).
    Ids,
    // Full FASTA names.
    Names,
    // Sequence lengths.
    Lengths,
    // Sum of sequence lengths.
    TotalLength,
    // Mask regions.
    Mask,
    // Four-bit encoded sequences.
    FourBit,
    // All sequences concatenated (no newlines).
    Seq,
    // Sequences separated by newlines.
    Sequences,
    // FASTA file output.
    Fasta,
    // FASTQ file output.
    Fastq,
}

/// Renders the output kind as a short human-readable label.
impl fmt::Display for UnnafOutput {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let label = match self {
            UnnafOutput::Format => "Format",
            UnnafOutput::PartList => "Part List",
            UnnafOutput::Sizes => "Sizes",
            UnnafOutput::Number => "Number",
            UnnafOutput::Title => "Title",
            UnnafOutput::Ids => "IDs",
            UnnafOutput::Names => "Names",
            UnnafOutput::Lengths => "Lengths",
            UnnafOutput::TotalLength => "Total Lengths",
            UnnafOutput::Mask => "Mask",
            UnnafOutput::FourBit => "Four Bit",
            UnnafOutput::Seq => "Seq",
            UnnafOutput::Sequences => "Sequences",
            UnnafOutput::Fasta => "FASTA",
            UnnafOutput::Fastq => "FASTQ",
        };
        f.write_str(label)
    }
}

// Command-line options of the decoding subcommand.
// Plain `//` comments are deliberate: clap turns `///` doc comments on
// derived fields into help text.
#[derive(Debug, Parser)]
struct UnnafArgs {
    // Help text previously said "read to STDOUT"; the option describes
    // where output is written.
    #[arg(
        short,
        long,
        value_name = "FILE",
        help = "Write uncompressed output to FILE -- write to STDOUT if not specified"
    )]
    output: Option<String>,
    // `value_enum` makes clap derive the default's textual form from the
    // `ValueEnum` value name. Without it the default is rendered through
    // `Display` ("FASTA"), which is not one of the accepted value names.
    #[arg(short = 't', long, value_enum, default_value_t = UnnafOutput::Fasta)]
    output_type: UnnafOutput,
    #[arg(long, value_name = "N", default_value_t = 80, help = "Override line length to N")]
    line_length: u16,
    #[arg(long, default_value_t = false, help = "Ignore Mask")]
    no_mask: bool,
    #[arg(long, default_value_t = false, help = "Set STDOUT stream to binary mode")]
    binary_stdout: bool,
    #[arg(long, default_value_t = false, help = "Set STDERR stream to binary mode")]
    binary_stderr: bool,
    // Positional: path of the input file.
    filename: String,
}

fn main() {
let args=Args::parse();
match args.process {
Process::ENNAF (ennaf_args) => {
ennaf::encode_file(&ennaf_args);
}
Process::UNNAF (unnaf_args) => {
unnaf::decode_naf(&unnaf_args);
}
};
}
Loading