Merge pull request #87 from matrulda/develop-643_use_closest_read_length

johandahlberg · web-flow · commit f389fccd02f0 · 2019-12-13T10:01:15.000+01:00
Option to use closest read length
diff --git a/checkQC/app.py b/checkQC/app.py
@@ -25,9 +25,10 @@
 @click.option("--config", help="Path to the checkQC configuration file", type=click.Path())
 @click.option('--json', is_flag=True, default=False, help="Print the results of the run as json to stdout")
 @click.option('--downgrade-errors', type=str, multiple=True, help="Downgrade errors to warnings for a specific handler, can be used multiple times")
+@click.option('--use-closest-read-length', is_flag=True, default=False, help="Use the closest read length if the read length used isn't specified in the config")
 @click.version_option(checkqc_version)
 @click.argument('runfolder', type=click.Path())
-def start(config, json, downgrade_errors, runfolder):
+def start(config, json, downgrade_errors, use_closest_read_length, runfolder):
     """
     checkQC is a command line utility designed to quickly gather and assess quality control metrics from an
     Illumina sequencing run. It is highly customizable and which quality controls modules should be run
@@ -36,7 +37,7 @@ def start(config, json, downgrade_errors, runfolder):
     # -----------------------------------
     # This is the application entry point
     # -----------------------------------
-    app = App(runfolder, config, json, downgrade_errors)
+    app = App(runfolder, config, json, downgrade_errors, use_closest_read_length)
     app.run()
     sys.exit(app.exit_status)
 
@@ -46,11 +47,13 @@ class App(object):
     This is the main application object for CheckQC.
     """
 
-    def __init__(self, runfolder, config_file=None, json_mode=False, downgrade_errors_for=()):
+    def __init__(self, runfolder, config_file=None, json_mode=False,
+                 downgrade_errors_for=(), use_closest_read_length=False):
         self._runfolder = runfolder
         self._config_file = config_file
         self._json_mode = json_mode
         self._downgrade_errors_for = downgrade_errors_for
+        self._use_closest_read_length = use_closest_read_length
         self.exit_status = 0
 
     def configure_and_run(self):
@@ -73,7 +76,8 @@ def configure_and_run(self):
         # TODO For now assume symmetric read lengths
         both_read_lengths = run_type_recognizer.read_length()
         read_length = int(both_read_lengths.split("-")[0])
-        handler_config = config.get_handler_configs(instrument_and_reagent_version, read_length, self._downgrade_errors_for)
+        handler_config = config.get_handler_configs(instrument_and_reagent_version, read_length,
+                                                    self._downgrade_errors_for, self._use_closest_read_length)
 
         run_type_summary = RunTypeSummarizer.summarize(instrument_and_reagent_version, both_read_lengths, handler_config)
 
diff --git a/checkQC/config.py b/checkQC/config.py
@@ -78,7 +78,7 @@ def __init__(self, config):
         """
         self._config = config
 
-    def _get_matching_handler(self, instrument_and_reagent_type, read_length):
+    def _get_matching_handler(self, instrument_and_reagent_type, read_length, use_closest_read_length=False):
         """
         Get the handler matching the provided parameters.
 
@@ -90,7 +90,6 @@ def _get_matching_handler(self, instrument_and_reagent_type, read_length):
 
         try:
             config_read_lengths = list(map(str, self._config[instrument_and_reagent_type].keys()))
-
             for config_read_length in config_read_lengths:
                 if "-" in config_read_length:
                     split_read_length = config_read_length.split("-")
@@ -101,6 +100,10 @@ def _get_matching_handler(self, instrument_and_reagent_type, read_length):
                 else:
                     if int(read_length) == int(config_read_length):
                         return self._config[instrument_and_reagent_type][int(config_read_length)]["handlers"]
+            if use_closest_read_length:
+                closest_read_length = self._find_closest_read_length(config_read_lengths, read_length)
+                log.info(f"Read length {read_length} not find in config. Using closest read length: {closest_read_length}")
+                return self._config[instrument_and_reagent_type][closest_read_length]["handlers"]
             raise ConfigEntryMissing("Could not find a config entry matching read length '{}' on "
                                      "instrument '{}'. Please check the provided "
                                      "config.".format(read_length, instrument_and_reagent_type))
@@ -110,6 +113,28 @@ def _get_matching_handler(self, instrument_and_reagent_type, read_length):
                                      "file ".format(instrument_and_reagent_type,
                                                     read_length))
 
+    def _find_closest_read_length(self, config_read_lengths, read_length):
+        """
+        Find the config read length closest to the given read length.
+        On a tie the numerically longer config read length is returned.
+
+        :param config_read_lengths: list of config read lengths as strings,
+            each either a range, e.g. '50-70', or a single value, e.g. '50'
+        :param read_length: the read length to match, a single value (anything int() accepts)
+        :returns: the closest read length, as a string (if interval) or int (if single value)
+        """
+        target = int(read_length)
+        # Sort numerically, longest first: a plain string sort ranks '99' above '101',
+        # which would hand a tie to the shorter read length.
+        by_length = sorted(config_read_lengths, reverse=True,
+                           key=lambda entry: [int(bound) for bound in entry.split("-")])
+        # min() keeps the first minimum it meets, so on a tie the longest entry wins.
+        closest_read_length = min(by_length, key=lambda entry: min(
+            abs(target - int(bound)) for bound in entry.split("-")))
+        if "-" in closest_read_length:
+            return closest_read_length
+        return int(closest_read_length)
+
     def _add_default_config(self, current_handler_config):
         """
         Add the default handlers specified in the config.
@@ -144,7 +169,8 @@ def _downgrade_errors(self, current_handler_config, downgrade_errors_for):
                 downgraded_handler_config.append(handler)
         return downgraded_handler_config
 
-    def get_handler_configs(self, instrument_and_reagent_type, read_length, downgrade_errors_for=()):
+    def get_handler_configs(self, instrument_and_reagent_type, read_length,
+                            downgrade_errors_for=(), use_closest_read_length=False):
         """
         Get the handler configurations for the specified parameters.
 
@@ -154,7 +180,8 @@ def get_handler_configs(self, instrument_and_reagent_type, read_length, downgrad
         """
 
         try:
-            handler_config = self._get_matching_handler(instrument_and_reagent_type, read_length)
+            handler_config = self._get_matching_handler(instrument_and_reagent_type,
+                                                        read_length, use_closest_read_length)
             handler_config_with_defaults = self._add_default_config(handler_config)
             downgraded_handler_config_with_defaults = self._downgrade_errors(
                                                             handler_config_with_defaults,
diff --git a/checkQC/web_app.py b/checkQC/web_app.py
@@ -26,11 +26,15 @@ def initialize(self, **kwargs):
         self.monitor_path = kwargs["monitoring_path"]
         self.qc_config_file = kwargs["qc_config_file"]
         self.downgrade_errors_for = ()
+        self.use_closest_read_length = False
 
     @staticmethod
-    def _run_check_qc(monitor_path, qc_config_file, runfolder, downgrade_errors_for):
+    def _run_check_qc(monitor_path, qc_config_file, runfolder, downgrade_errors_for,
+                      use_closest_read_length):
         path_to_runfolder = os.path.join(monitor_path, runfolder)
-        checkqc_app = App(config_file=qc_config_file, runfolder=path_to_runfolder, downgrade_errors_for=downgrade_errors_for)
+        checkqc_app = App(config_file=qc_config_file, runfolder=path_to_runfolder,
+                          downgrade_errors_for=downgrade_errors_for,
+                          use_closest_read_length=use_closest_read_length)
         reports = checkqc_app.configure_and_run()
         reports["version"] = checkqc_version
         return reports
@@ -43,8 +47,12 @@ def _write_error(self, status_code, reason):
     def get(self, runfolder):
         if "downgrade" in self.request.query_arguments:
             self.downgrade_errors_for = self.get_query_argument("downgrade")
+        if "useClosestReadLength" in self.request.query_arguments:
+            self.use_closest_read_length = True
         try:
-            reports = self._run_check_qc(self.monitor_path, self.qc_config_file, runfolder, self.downgrade_errors_for)
+            reports = self._run_check_qc(self.monitor_path, self.qc_config_file,
+                                         runfolder, self.downgrade_errors_for,
+                                         self.use_closest_read_length)
             self.set_header("Content-Type", "application/json")
             self.write(reports)
         except RunfolderNotFoundError:
diff --git a/docs/index.rst b/docs/index.rst
@@ -237,6 +237,24 @@ This parameter can be supplied to the webservice as a query argument:
 
   curl -s -w'\n' localhost:9999/qc/170726_D00118_0303_BCB1TVANXX?downgrade=ReadsPerSampleHandler,UndeterminedPercentageHandler | python -m json.tool
 
+Use closest read length
+-----------------------
+
+It is possible to instruct CheckQC to use the closest read length if the read length of the run is not found in the config.
+In case of a tie between two read lengths, the longer read length (with stricter QC criteria) will be used.
+
+Usage:
+
+.. code-block :: console
+
+  $ checkqc --use-closest-read-length <RUNFOLDER>
+
+This parameter can be supplied to the webservice as a query argument:
+
+.. code-block :: console
+
+  curl -s -w'\n' localhost:9999/qc/170726_D00118_0303_BCB1TVANXX?useClosestReadLength | python -m json.tool
+
 Running CheckQC as a webservice
 -------------------------------
 
diff --git a/tests/resources/read_length_not_in_config.yaml b/tests/resources/read_length_not_in_config.yaml
@@ -0,0 +1,45 @@
+# NOTE
+# ----
+# This config is a partial copy of checkQC/default_config/config.yaml,
+# where the read length 126 is not specified for hiseq2500_rapidhighoutput_v4
+
+# Use this section to provide configuration options to the parsers
+parser_configurations:
+  StatsJsonParser:
+    # Path to where the bcl2fastq output (i.e. fastq files, etc) is located relative to
+    # the runfolder
+    bcl2fastq_output_path: Data/Intensities/BaseCalls
+  SamplesheetParser:
+    samplesheet_name: SampleSheet.csv
+
+default_handlers:
+  - name: UndeterminedPercentageHandler
+    warning: unknown
+    error: 9 # <% Phix on lane> + < value as %>
+  - name: UnidentifiedIndexHandler
+    significance_threshold: 1 # % of reads in unidentified
+    # Indexes which are white-listed will only cause a warning even if they occur
+    # above the significance level.
+    # They will be matched like regular expressions,
+    # so e.g. NNN will match exactly three NNNs, while
+    # N{3,} will match three or more Ns.
+    white_listed_indexes:
+      - .*N.*
+      - G{6,}
+
+hiseq2500_rapidhighoutput_v4:
+  131:
+    handlers:
+      - name: ClusterPFHandler
+        warning: 180 # Millions of clusters
+        error: unknown
+      - name: Q30Handler
+        warning: 80 # Give percentage for reads greater than Q30
+        error: unknown # Give percentage for reads greater than Q30
+      - name: ErrorRateHandler
+        allow_missing_error_rate: False
+        warning: 2
+        error: unknown
+      - name: ReadsPerSampleHandler
+        warning: unknown
+        error: 90 # 50 % of threshold for clusters pass filter
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -19,11 +19,16 @@ def test_run_json_mode(self):
         # The test data contains fatal qc errors
         self.assertEqual(app.run(), 1)
 
+    def test_run_use_closest_read_length(self):
+        config_file = os.path.join("tests", "resources", "read_length_not_in_config.yaml")
+        app = App(runfolder=self.RUNFOLDER, config_file=config_file, use_closest_read_length=True)
+        # The test data contains fatal qc errors
+        self.assertEqual(app.run(), 1)
+
     def test_run_downgrade_error(self):
         app = App(runfolder=self.RUNFOLDER, downgrade_errors_for="ReadsPerSampleHandler")
         # Test data should not produce fatal qc errors anymore
         self.assertEqual(app.run(), 0)
 
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_config.py b/tests/test_config.py
@@ -20,6 +20,12 @@ def setUp(self):
                            '150-299': {'handlers': [
                                self.second_handler]}
                        },
+                       'hiseqx': {
+                           50: {'handlers': [
+                               self.first_handler]},
+                           52: {'handlers': [
+                               self.second_handler]}
+                       },
                        "default_handlers": [
                            self.default_handler,
                            self.first_handler
@@ -55,6 +61,18 @@ def test_downgrade_errors(self):
         self.assertEqual(result[1]["error"], "unknown")
         self.assertEqual(result[1]["warning"], 100)
 
+    def test_use_closest_read_length(self):
+        result = self.config._get_matching_handler('miseq_v3', 149, use_closest_read_length=True)
+        self.assertEqual(result, [self.second_handler])
+
+    def test_use_closest_read_length_in_the_middle(self):
+        result = self.config._get_matching_handler('hiseqx', 51, use_closest_read_length=True)
+        self.assertEqual(result, [self.second_handler])  # 51 ties between the 50 and 52 entries; the 52 entry is expected
+
+    def test_machine_and_reagent_type_not_found(self):
+        with self.assertRaises(ConfigEntryMissing):
+            self.config._get_matching_handler('foo', 51, use_closest_read_length=True)
+
 
 class TestConfigFactory(unittest.TestCase):
 
diff --git a/tests/test_web_app.py b/tests/test_web_app.py
@@ -42,3 +42,25 @@ def get_app(self):
     def test_qc_fail_fast_for_unknown_config(self):
         response = self.fetch('/qc/170726_D00118_0303_BCB1TVANXX')
         self.assertEqual(response.code, 500)
+
+
+class TestWebAppReadLengthNotInConfig(AsyncHTTPTestCase):
+
+    def get_app(self):
+        routes = WebApp._routes(monitoring_path=os.path.join("tests", "resources"),
+                                qc_config_file=os.path.join("tests", "resources", "read_length_not_in_config.yaml"))
+        return tornado.web.Application(routes)
+
+    def test_use_closest_read_length(self):
+        response = self.fetch('/qc/170726_D00118_0303_BCB1TVANXX?useClosestReadLength')
+        result = json.loads(response.body)
+        self.assertEqual(response.code, 200)
+        # The test data is expected to produce fatal qc errors
+        self.assertEqual(result["exit_status"], 1)
+
+    def test_use_closest_read_length_and_downgrade_errors(self):
+        response = self.fetch('/qc/170726_D00118_0303_BCB1TVANXX?useClosestReadLength&downgrade=ReadsPerSampleHandler')
+        result = json.loads(response.body)
+        self.assertEqual(response.code, 200)
+        # With ReadsPerSampleHandler downgraded, no fatal qc errors are expected
+        self.assertEqual(result["exit_status"], 0)