9
9
from collections import Counter
10
10
from itertools import groupby
11
11
12
+ import Levenshtein as lev
12
13
import numpy as np
13
14
import pkg_resources
14
15
@@ -34,7 +35,6 @@ def run_demux(args):
34
35
if args .debug :
35
36
log .setLevel (logging .DEBUG )
36
37
run_uuid = str (uuid .uuid4 ())
37
- os .mkdir (args .out_fastq )
38
38
ss = SampleSheet (args .samplesheet , args .ont_barcodes )
39
39
version = pkg_resources .get_distribution ("bio-anglerfish" ).version
40
40
report = Report (args .run_name , run_uuid , version )
@@ -80,7 +80,12 @@ def run_demux(args):
80
80
adaptors_sorted [(entry .adaptor .name , entry .ont_barcode )].append (
81
81
(entry .sample_name , entry .adaptor , os .path .abspath (entry .fastq ))
82
82
)
83
-
83
+ if os .path .exists (args .out_fastq ):
84
+ raise FileExistsError (
85
+ f"Output folder '{ args .out_fastq } ' already exists. Please remove it or specify another --run_name"
86
+ )
87
+ else :
88
+ os .mkdir (args .out_fastq )
84
89
out_fastqs = []
85
90
for key , sample in adaptors_sorted .items ():
86
91
adaptor_name , ont_barcode = key
@@ -181,6 +186,7 @@ def run_demux(args):
181
186
f" Lenient mode: Reverse complementing { best_flip } index for adaptor { adaptor_name } found at least { args .lenient_factor } times more matches"
182
187
)
183
188
no_matches , matches = flipped [best_flip ]
189
+ flipped_i7 , flipped_i5 = flips [best_flip ].values ()
184
190
else :
185
191
log .info (
186
192
f" Lenient mode: using original index orientation for { adaptor_name } "
@@ -238,11 +244,41 @@ def run_demux(args):
238
244
239
245
# Top unmatched indexes
240
246
nomatch_count = Counter ([x [3 ] for x in no_matches ])
241
- if args .max_unknowns is None :
247
+ if args .max_unknowns == 0 :
242
248
args .max_unknowns = len ([sample for sample in ss ]) + 10
243
- report .add_unmatched_stat (
244
- nomatch_count .most_common (args .max_unknowns ), ont_barcode , adaptor_name
245
- )
249
+
250
+ # For each of the most common unknown indexes, find the samplesheet entry whose "i7+i5" index string is closest by Levenshtein distance
251
+ top_unknowns = []
252
+ for i in nomatch_count .most_common (args .max_unknowns ):
253
+ sample_dists = [
254
+ (
255
+ lev .distance (
256
+ i [0 ], f"{ x .adaptor .i7_index } +{ x .adaptor .i5_index } " .lower ()
257
+ ),
258
+ x .sample_name ,
259
+ )
260
+ for x in ss
261
+ ]
262
+ closest_sample = min (sample_dists , key = lambda x : x [0 ])
263
+ # If the distance is at least half the unknown index string's length plus one, the match is too weak: report no closest sample (None)
264
+ if closest_sample [0 ] >= (len (i [0 ]) / 2 ) + 1 :
265
+ closest_sample = (closest_sample [0 ], None )
266
+ else :
267
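+ # Several samples may tie at the same minimum distance; collect the names of the others
+ # NOTE(review): the filter below excludes the first-found closest sample, so its name is dropped when ties exist - confirm this is intended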
268
+ all_min = [
269
+ x [1 ]
270
+ for x in sample_dists
271
+ if x [0 ] == closest_sample [0 ] and x [1 ] != closest_sample [1 ]
272
+ ]
273
+ # This list might be too long, so we truncate it
274
+ if len (all_min ) > 4 :
275
+ all_min = all_min [:4 ]
276
+ all_min .append ("..." )
277
+ if all_min :
278
+ closest_sample = (closest_sample [0 ], ";" .join (all_min ))
279
+
280
+ top_unknowns .append ([i [0 ], i [1 ], closest_sample [1 ]])
281
+ report .add_unmatched_stat (top_unknowns , ont_barcode , adaptor_name )
246
282
247
283
# Check if there were samples in the samplesheet without adaptor alignments and add them to report
248
284
for entry in ss :
0 commit comments