Skip to content

Commit e43d8f4

Browse files
committed
Add matching unknown indices to samplesheet
1 parent 4e6c8cd commit e43d8f4

File tree

2 files changed

+45
-9
lines changed

2 files changed

+45
-9
lines changed

anglerfish/anglerfish.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from collections import Counter
1010
from itertools import groupby
1111

12+
import Levenshtein as lev
1213
import numpy as np
1314
import pkg_resources
1415

@@ -242,11 +243,41 @@ def run_demux(args):
242243

243244
# Top unmatched indexes
244245
nomatch_count = Counter([x[3] for x in no_matches])
245-
if args.max_unknowns is None:
246+
if args.max_unknowns == 0:
246247
args.max_unknowns = len([sample for sample in ss]) + 10
247-
report.add_unmatched_stat(
248-
nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name
249-
)
248+
249+
# For each of the most common unknown indexes, find the samplesheet entry whose "i7+i5" index string has the smallest Levenshtein distance to it
250+
top_unknowns = []
251+
for i in nomatch_count.most_common(args.max_unknowns):
252+
sample_dists = [
253+
(
254+
lev.distance(
255+
i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower()
256+
),
257+
x.sample_name,
258+
)
259+
for x in ss
260+
]
261+
closest_sample = min(sample_dists, key=lambda x: x[0])
262+
# If the distance is at least half the length of the unknown index string plus one, the match is too distant: report no closest sample (None)
263+
if closest_sample[0] >= (len(i[0]) / 2) + 1:
264+
closest_sample = (closest_sample[0], None)
265+
else:
266+
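# Several samples may be tied at the same minimum distance; collect the names of the other tied samples (NOTE(review): the joined list below omits the first closest sample's own name; confirm this is intended)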
267+
all_min = [
268+
x[1]
269+
for x in sample_dists
270+
if x[0] == closest_sample[0] and x[1] != closest_sample[1]
271+
]
272+
# This list might be too long, so we truncate it
273+
if len(all_min) > 4:
274+
all_min = all_min[:4]
275+
all_min.append("...")
276+
if all_min:
277+
closest_sample = (closest_sample[0], ";".join(all_min))
278+
279+
top_unknowns.append([i[0], i[1], closest_sample[1]])
280+
report.add_unmatched_stat(top_unknowns, ont_barcode, adaptor_name)
250281

251282
# Check if there were samples in the samplesheet without adaptor alignments and add them to report
252283
for entry in ss:

anglerfish/demux/report.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99

1010
class Report:
11-
unmatch_header = ["index", "num_reads", "ont_barcode"]
11+
unmatch_header = ["index", "num_reads", "ont_barcode", "closest_match"]
1212

1313
def __init__(self, run_name, uuid, version):
1414
self.run_name = run_name
@@ -44,8 +44,8 @@ def write_report(self, outdir):
4444
uhead = getattr(Report, "unmatch_header")
4545
f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab
4646
for key, unmatch in self.unmatched_stats.items():
47-
for idx, mnum in unmatch:
48-
f.write(f"{idx}\t{mnum}\t{key[0]}\n")
47+
for idx, mnum, closest in unmatch:
48+
f.write(f"{idx}\t{mnum}\t{key[0]}\t{closest}\n")
4949
log.debug(
5050
f"Wrote anglerfish_stats.txt to {outdir}, size: {os.path.getsize(os.path.join(outdir, 'anglerfish_stats.txt'))} bytes"
5151
)
@@ -75,9 +75,14 @@ def write_json(self, outdir):
7575
dict(zip(getattr(SampleStat, "header"), slist))
7676
)
7777
for key, unmatch in self.unmatched_stats.items():
78-
for idx, mnum in unmatch:
78+
for idx, mnum, closest in unmatch:
7979
json_out["undetermined"].append(
80-
dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]]))
80+
dict(
81+
zip(
82+
getattr(Report, "unmatch_header"),
83+
[idx, mnum, key[0], closest],
84+
)
85+
)
8186
)
8287
with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f:
8388
f.write(json.dumps(json_out, indent=2, sort_keys=True))

0 commit comments

Comments
 (0)