Merge pull request #83 from remiolsen/unknown_improvements

remiolsen · web-flow · commit 654b27103a82 · 2024-05-28T11:30:41.000+02:00
Finalising 0.7 release
diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
@@ -9,6 +9,7 @@
 from collections import Counter
 from itertools import groupby
 
+import Levenshtein as lev
 import numpy as np
 import pkg_resources
 
@@ -34,7 +35,6 @@ def run_demux(args):
     if args.debug:
         log.setLevel(logging.DEBUG)
     run_uuid = str(uuid.uuid4())
-    os.mkdir(args.out_fastq)
     ss = SampleSheet(args.samplesheet, args.ont_barcodes)
     version = pkg_resources.get_distribution("bio-anglerfish").version
     report = Report(args.run_name, run_uuid, version)
@@ -80,7 +80,12 @@ def run_demux(args):
         adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append(
             (entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))
         )
-
+    if os.path.exists(args.out_fastq):
+        raise FileExistsError(
+            f"Output folder '{args.out_fastq}' already exists. Please remove it or specify another --run_name"
+        )
+    else:
+        os.mkdir(args.out_fastq)
     out_fastqs = []
     for key, sample in adaptors_sorted.items():
         adaptor_name, ont_barcode = key
@@ -181,6 +186,7 @@ def run_demux(args):
                     f" Lenient mode: Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches"
                 )
                 no_matches, matches = flipped[best_flip]
+                flipped_i7, flipped_i5 = flips[best_flip].values()
             else:
                 log.info(
                     f" Lenient mode: using original index orientation for {adaptor_name}"
@@ -238,11 +244,41 @@ def run_demux(args):
 
         # Top unmatched indexes
         nomatch_count = Counter([x[3] for x in no_matches])
-        if args.max_unknowns is None:
+        if args.max_unknowns == 0:
             args.max_unknowns = len([sample for sample in ss]) + 10
-        report.add_unmatched_stat(
-            nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name
-        )
+
+        # We search for the closest sample in the samplesheet to the list of unknowns
+        top_unknowns = []
+        for i in nomatch_count.most_common(args.max_unknowns):
+            sample_dists = [
+                (
+                    lev.distance(
+                        i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower()
+                    ),
+                    x.sample_name,
+                )
+                for x in ss
+            ]
+            closest_sample = min(sample_dists, key=lambda x: x[0])
+            # If the distance is more than half the index length, we remove it
+            if closest_sample[0] >= (len(i[0]) / 2) + 1:
+                closest_sample = (closest_sample[0], None)
+            else:
+                # We might have two samples with the same distance
+                all_min = [
+                    x[1]
+                    for x in sample_dists
+                    if x[0] == closest_sample[0] and x[1] != closest_sample[1]
+                ]
+                # This list might be too long, so we truncate it
+                if len(all_min) > 4:
+                    all_min = all_min[:4]
+                    all_min.append("...")
+                if all_min:
+                    closest_sample = (closest_sample[0], ";".join(all_min))
+
+            top_unknowns.append([i[0], i[1], closest_sample[1]])
+        report.add_unmatched_stat(top_unknowns, ont_barcode, adaptor_name)
 
     # Check if there were samples in the samplesheet without adaptor alignments and add them to report
     for entry in ss:
diff --git a/anglerfish/cli.py b/anglerfish/cli.py
@@ -269,6 +269,9 @@ def run(
     )
     utcnow = dt.datetime.now(dt.timezone.utc)
     runname = utcnow.strftime(f"{args.run_name}_%Y_%m_%d_%H%M%S")
+    if run_name != "anglerfish":
+        runname = args.run_name
+
     assert os.path.exists(args.out_fastq), f"Output folder '{args.out_fastq}' not found"
     assert os.path.exists(
         args.samplesheet
diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py
@@ -8,7 +8,7 @@
 
 
 class Report:
-    unmatch_header = ["index", "num_reads", "ont_barcode"]
+    unmatch_header = ["index", "num_reads", "ont_barcode", "closest_match"]
 
     def __init__(self, run_name, uuid, version):
         self.run_name = run_name
@@ -44,8 +44,8 @@ def write_report(self, outdir):
             uhead = getattr(Report, "unmatch_header")
             f.write(f"\n{chr(9).join(uhead)}\n")  # chr(9) = tab
             for key, unmatch in self.unmatched_stats.items():
-                for idx, mnum in unmatch:
-                    f.write(f"{idx}\t{mnum}\t{key[0]}\n")
+                for idx, mnum, closest in unmatch:
+                    f.write(f"{idx}\t{mnum}\t{key[0]}\t{closest}\n")
         log.debug(
             f"Wrote anglerfish_stats.txt to {outdir}, size: {os.path.getsize(os.path.join(outdir, 'anglerfish_stats.txt'))} bytes"
         )
@@ -75,9 +75,14 @@ def write_json(self, outdir):
                 dict(zip(getattr(SampleStat, "header"), slist))
             )
         for key, unmatch in self.unmatched_stats.items():
-            for idx, mnum in unmatch:
+            for idx, mnum, closest in unmatch:
                 json_out["undetermined"].append(
-                    dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]]))
+                    dict(
+                        zip(
+                            getattr(Report, "unmatch_header"),
+                            [idx, mnum, key[0], closest],
+                        )
+                    )
                 )
         with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f:
             f.write(json.dumps(json_out, indent=2, sort_keys=True))