Skip to content

Commit 654b271

Browse files
authored
Merge pull request #83 from remiolsen/unknown_improvements
Finalising 0.7 release
2 parents 249360c + 68302fc commit 654b271

File tree

3 files changed

+55
-11
lines changed

3 files changed

+55
-11
lines changed

anglerfish/anglerfish.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from collections import Counter
1010
from itertools import groupby
1111

12+
import Levenshtein as lev
1213
import numpy as np
1314
import pkg_resources
1415

@@ -34,7 +35,6 @@ def run_demux(args):
3435
if args.debug:
3536
log.setLevel(logging.DEBUG)
3637
run_uuid = str(uuid.uuid4())
37-
os.mkdir(args.out_fastq)
3838
ss = SampleSheet(args.samplesheet, args.ont_barcodes)
3939
version = pkg_resources.get_distribution("bio-anglerfish").version
4040
report = Report(args.run_name, run_uuid, version)
@@ -80,7 +80,12 @@ def run_demux(args):
8080
adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append(
8181
(entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))
8282
)
83-
83+
if os.path.exists(args.out_fastq):
84+
raise FileExistsError(
85+
f"Output folder '{args.out_fastq}' already exists. Please remove it or specify another --run_name"
86+
)
87+
else:
88+
os.mkdir(args.out_fastq)
8489
out_fastqs = []
8590
for key, sample in adaptors_sorted.items():
8691
adaptor_name, ont_barcode = key
@@ -181,6 +186,7 @@ def run_demux(args):
181186
f" Lenient mode: Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches"
182187
)
183188
no_matches, matches = flipped[best_flip]
189+
flipped_i7, flipped_i5 = flips[best_flip].values()
184190
else:
185191
log.info(
186192
f" Lenient mode: using original index orientation for {adaptor_name}"
@@ -238,11 +244,41 @@ def run_demux(args):
238244

239245
# Top unmatched indexes
240246
nomatch_count = Counter([x[3] for x in no_matches])
241-
if args.max_unknowns is None:
247+
if args.max_unknowns == 0:
242248
args.max_unknowns = len([sample for sample in ss]) + 10
243-
report.add_unmatched_stat(
244-
nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name
245-
)
249+
250+
# We search for the closest sample in the samplesheet to the list of unknowns
251+
top_unknowns = []
252+
for i in nomatch_count.most_common(args.max_unknowns):
253+
sample_dists = [
254+
(
255+
lev.distance(
256+
i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower()
257+
),
258+
x.sample_name,
259+
)
260+
for x in ss
261+
]
262+
closest_sample = min(sample_dists, key=lambda x: x[0])
263+
# If the distance is more than half the index length, we remove it
264+
if closest_sample[0] >= (len(i[0]) / 2) + 1:
265+
closest_sample = (closest_sample[0], None)
266+
else:
267+
# We might have two samples with the same distance
268+
all_min = [
269+
x[1]
270+
for x in sample_dists
271+
if x[0] == closest_sample[0] and x[1] != closest_sample[1]
272+
]
273+
# This list might be too long, so we truncate it
274+
if len(all_min) > 4:
275+
all_min = all_min[:4]
276+
all_min.append("...")
277+
if all_min:
278+
closest_sample = (closest_sample[0], ";".join(all_min))
279+
280+
top_unknowns.append([i[0], i[1], closest_sample[1]])
281+
report.add_unmatched_stat(top_unknowns, ont_barcode, adaptor_name)
246282

247283
# Check if there were samples in the samplesheet without adaptor alignments and add them to report
248284
for entry in ss:

anglerfish/cli.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,9 @@ def run(
269269
)
270270
utcnow = dt.datetime.now(dt.timezone.utc)
271271
runname = utcnow.strftime(f"{args.run_name}_%Y_%m_%d_%H%M%S")
272+
if run_name != "anglerfish":
273+
runname = args.run_name
274+
272275
assert os.path.exists(args.out_fastq), f"Output folder '{args.out_fastq}' not found"
273276
assert os.path.exists(
274277
args.samplesheet

anglerfish/demux/report.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99

1010
class Report:
11-
unmatch_header = ["index", "num_reads", "ont_barcode"]
11+
unmatch_header = ["index", "num_reads", "ont_barcode", "closest_match"]
1212

1313
def __init__(self, run_name, uuid, version):
1414
self.run_name = run_name
@@ -44,8 +44,8 @@ def write_report(self, outdir):
4444
uhead = getattr(Report, "unmatch_header")
4545
f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab
4646
for key, unmatch in self.unmatched_stats.items():
47-
for idx, mnum in unmatch:
48-
f.write(f"{idx}\t{mnum}\t{key[0]}\n")
47+
for idx, mnum, closest in unmatch:
48+
f.write(f"{idx}\t{mnum}\t{key[0]}\t{closest}\n")
4949
log.debug(
5050
f"Wrote anglerfish_stats.txt to {outdir}, size: {os.path.getsize(os.path.join(outdir, 'anglerfish_stats.txt'))} bytes"
5151
)
@@ -75,9 +75,14 @@ def write_json(self, outdir):
7575
dict(zip(getattr(SampleStat, "header"), slist))
7676
)
7777
for key, unmatch in self.unmatched_stats.items():
78-
for idx, mnum in unmatch:
78+
for idx, mnum, closest in unmatch:
7979
json_out["undetermined"].append(
80-
dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]]))
80+
dict(
81+
zip(
82+
getattr(Report, "unmatch_header"),
83+
[idx, mnum, key[0], closest],
84+
)
85+
)
8186
)
8287
with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f:
8388
f.write(json.dumps(json_out, indent=2, sort_keys=True))

0 commit comments

Comments
 (0)