Commit 35d1282

[major] release HAT
1 parent 9e3d8af commit 35d1282

File tree: 214 files changed, +89495 -10 lines changed


HAT_ACL.pdf

-2.29 MB
Binary file not shown.

LICENSE

+50
@@ -0,0 +1,50 @@
MIT License
------------ LICENSE For Hardware-Aware Transformer software ---------------
Copyright (c) 2020, Hanrui Wang, Zhanghao Wu, Zhijian Liu, Han Cai,
Ligeng Zhu, Chuang Gan and Song Han
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

------------------------- LICENSE FOR Fairseq ------------------------------
MIT License

Copyright (c) Facebook, Inc. and its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

+208-10
Large diffs are not rendered by default.

average_checkpoints.py

+140
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import collections
import os
import re

import torch


def average_checkpoints(inputs):
    """Loads checkpoints from inputs and returns a model with averaged weights.

    Args:
        inputs: An iterable of string paths of checkpoints to load from.

    Returns:
        A dict of string keys mapping to various values. The 'model' key
        from the returned dict should correspond to an OrderedDict mapping
        string parameter names to torch Tensors.
    """
    params_dict = collections.OrderedDict()
    params_keys = None
    new_state = None
    num_models = len(inputs)

    for f in inputs:
        state = torch.load(
            f,
            map_location=(
                lambda s, _: torch.serialization.default_restore_location(s, 'cpu')
            ),
        )
        # Copies over the settings from the first checkpoint
        if new_state is None:
            new_state = state

        model_params = state['model']

        model_params_keys = list(model_params.keys())
        if params_keys is None:
            params_keys = model_params_keys
        elif params_keys != model_params_keys:
            raise KeyError(
                'For checkpoint {}, expected list of params: {}, '
                'but found: {}'.format(f, params_keys, model_params_keys)
            )

        for k in params_keys:
            p = model_params[k]
            if isinstance(p, torch.HalfTensor):
                p = p.float()
            if k not in params_dict:
                # NOTE: clone() is needed in case p is a shared parameter
                params_dict[k] = p.clone()
            else:
                params_dict[k] += p

    averaged_params = collections.OrderedDict()
    for k, v in params_dict.items():
        averaged_params[k] = v
        averaged_params[k].div_(num_models)
    new_state['model'] = averaged_params
    return new_state


def last_n_checkpoints(paths, n, update_based, upper_bound=None):
    assert len(paths) == 1
    path = paths[0]
    if update_based:
        pt_regexp = re.compile(r'checkpoint_\d+_(\d+)\.pt')
    else:
        pt_regexp = re.compile(r'checkpoint(\d+)\.pt')
    files = os.listdir(path)

    entries = []
    for f in files:
        m = pt_regexp.fullmatch(f)
        if m is not None:
            sort_key = int(m.group(1))
            if upper_bound is None or sort_key <= upper_bound:
                entries.append((sort_key, m.group(0)))
    if len(entries) < n:
        raise Exception(
            'Found {} checkpoint files but need at least {}'.format(len(entries), n)
        )
    return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]


def main():
    parser = argparse.ArgumentParser(
        description='Tool to average the params of input checkpoints to '
                    'produce a new checkpoint',
    )
    # fmt: off
    parser.add_argument('--inputs', required=True, nargs='+',
                        help='Input checkpoint file paths.')
    parser.add_argument('--output', required=True, metavar='FILE',
                        help='Write the new checkpoint containing the averaged weights to this path.')
    num_group = parser.add_mutually_exclusive_group()
    num_group.add_argument('--num-epoch-checkpoints', type=int,
                           help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
                                'and average last this many of them.')
    num_group.add_argument('--num-update-checkpoints', type=int,
                           help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
                                'and average last this many of them.')
    parser.add_argument('--checkpoint-upper-bound', type=int,
                        help='when using --num-epoch-checkpoints, this will set an upper bound on which checkpoint to use, '
                             'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.')
    # fmt: on
    args = parser.parse_args()
    print(args)

    num = None
    is_update_based = False
    if args.num_update_checkpoints is not None:
        num = args.num_update_checkpoints
        is_update_based = True
    elif args.num_epoch_checkpoints is not None:
        num = args.num_epoch_checkpoints

    assert args.checkpoint_upper_bound is None or args.num_epoch_checkpoints is not None, \
        '--checkpoint-upper-bound requires --num-epoch-checkpoints'
    assert args.num_epoch_checkpoints is None or args.num_update_checkpoints is None, \
        'Cannot combine --num-epoch-checkpoints and --num-update-checkpoints'

    if num is not None:
        args.inputs = last_n_checkpoints(
            args.inputs, num, is_update_based, upper_bound=args.checkpoint_upper_bound,
        )
        print('averaging checkpoints: ', args.inputs)

    new_state = average_checkpoints(args.inputs)
    torch.save(new_state, args.output)
    print('Finished writing averaged checkpoint to {}.'.format(args.output))


if __name__ == '__main__':
    main()
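
To make the averaging semantics concrete, a minimal sketch (toy checkpoints with hypothetical file names, not part of this commit) showing that the 'model' entry of the result is the elementwise mean of the input state dicts:

import collections
import torch
from average_checkpoints import average_checkpoints

# Write two toy checkpoints whose single parameter is all-1s and all-2s.
paths = []
for i in (1, 2):
    model = collections.OrderedDict({'w': torch.full((2,), float(i))})
    path = 'toy_checkpoint{}.pt'.format(i)
    torch.save({'model': model}, path)
    paths.append(path)

avg = average_checkpoints(paths)
print(avg['model']['w'])  # tensor([1.5000, 1.5000]): the elementwise mean of 1.0 and 2.0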
(Shell script; file name not shown in this commit view)

@@ -0,0 +1,10 @@
checkpoints_path=$1
avg_checkpoints=${2:-10}

model=average_model_$avg_checkpoints.pt
output_path=$checkpoints_path

python average_checkpoints.py \
    --inputs $output_path \
    --num-epoch-checkpoints $avg_checkpoints \
    --output $output_path/$model
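
A hypothetical invocation (the wrapper's file name is hidden in this view; 'average.sh' and the run directory are assumed), averaging the last 10 epoch checkpoints and writing average_model_10.pt into the same directory:

bash average.sh checkpoints/iwslt14.de-en/run1 10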
(YAML config for evolutionary search; file name not shown in this commit view)

@@ -0,0 +1,22 @@
evo-iter: 30
population-size: 125
parent-size: 25
mutation-size: 50
crossover-size: 50
mutation-prob: 0.3


# path to load latency predictor
ckpt-path: ./latency_dataset/predictors/iwslt14deen_gpu_titanxp.pt
# feature-norm should match the one used when training the latency predictor
feature-norm: [640, 6, 2048, 6, 640, 6, 2048, 6, 6, 2]
# lat-norm should match the one used when training the latency predictor
lat-norm: 200
# path to load supertransformer weights
restore-file: ./downloaded_models/HAT_iwslt14deen_super_space1.pt

# path to write subtransformer configs
write-config-path: configs/iwslt14.de-en/subtransformer/[email protected]
# latency constraint (ms)
latency-constraint: 200
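
For intuition, a minimal sketch of the evolutionary loop these hyperparameters drive (sample_arch, mutate, crossover, predict_latency, and fitness are hypothetical helpers, not this commit's code): each of the 30 iterations keeps the best 25 of a 125-candidate population, then refills it with 50 mutations and 50 crossovers (25 + 50 + 50 = 125), discarding candidates whose predicted latency exceeds the 200 ms constraint.

import random

EVO_ITER, POP, PARENTS = 30, 125, 25
MUT, CROSS, MUT_PROB, LAT_LIMIT = 50, 50, 0.3, 200  # latency constraint in ms

def evolve(sample_arch, mutate, crossover, predict_latency, fitness):
    population = [sample_arch() for _ in range(POP)]
    for _ in range(EVO_ITER):
        # Keep the best architectures (e.g., lowest validation loss).
        parents = sorted(population, key=fitness)[:PARENTS]
        population = list(parents)
        # Refill with mutations that satisfy the latency constraint.
        while len(population) < PARENTS + MUT:
            child = mutate(random.choice(parents), prob=MUT_PROB)
            if predict_latency(child) <= LAT_LIMIT:
                population.append(child)
        # Refill with crossovers of two random parents.
        while len(population) < PARENTS + MUT + CROSS:
            child = crossover(random.choice(parents), random.choice(parents))
            if predict_latency(child) <= LAT_LIMIT:
                population.append(child)
    return min(population, key=fitness)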
(Data download script; file name not shown in this commit view)

@@ -0,0 +1,9 @@
#!/bin/bash

mkdir -p data/binary/iwslt14_de_en

wget -O data/binary/iwslt14_de_en/iwslt14_de_en.preprocessed.tgz 'https://www.dropbox.com/s/t5dqiamjdzahhfc/iwslt14_de_en.preproessed.tgz?dl=0'

cd data/binary/iwslt14_de_en

tar -xzvf iwslt14_de_en.preprocessed.tgz
(YAML config for latency-dataset collection; file name not shown in this commit view)

@@ -0,0 +1,49 @@
lat-dataset-path: ./latency_dataset/iwslt14deen_gpu_titanxp.csv
lat-dataset-size: 2000
latgpu: True
latiter: 20
latsilent: True


# below are the configs for the data-point sampling space of the latency predictor

# model
arch: transformersuper_iwslt_de_en
max-tokens: 4096
data: data/binary/iwslt14_de_en
source-lang: de
target-lang: en

# SuperTransformer configs
encoder-embed-dim: 640
decoder-embed-dim: 640

encoder-ffn-embed-dim: 3072
decoder-ffn-embed-dim: 3072

encoder-layers: 6
decoder-layers: 6

encoder-attention-heads: 8
decoder-attention-heads: 8


qkv-dim: 512

# SubTransformers search space
encoder-embed-choice: [640, 512]
decoder-embed-choice: [640, 512]

encoder-ffn-embed-dim-choice: [3072, 2048, 1024, 512]
decoder-ffn-embed-dim-choice: [3072, 2048, 1024, 512]

encoder-layer-num-choice: [6]
decoder-layer-num-choice: [6, 5, 4, 3, 2, 1]

encoder-self-attention-heads-choice: [8, 4, 2]
decoder-self-attention-heads-choice: [8, 4, 2]
decoder-ende-attention-heads-choice: [8, 4, 2]

# for arbitrary encoder-decoder attention: -1 means attending to the last encoder layer,
# 1 means the last two encoder layers, 2 means the last three encoder layers
decoder-arbitrary-ende-attn-choice: [-1, 1, 2]
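
To illustrate how such a search space is sampled when collecting latency data points, a minimal sketch (choice lists copied from the config above; the function name is hypothetical, and it samples one value per knob for brevity, whereas per-layer knobs such as FFN dims and head counts can be sampled independently per layer):

import random

# Search-space choices copied from the config above.
SPACE = {
    'encoder-embed-dim': [640, 512],
    'decoder-embed-dim': [640, 512],
    'encoder-ffn-embed-dim': [3072, 2048, 1024, 512],
    'decoder-ffn-embed-dim': [3072, 2048, 1024, 512],
    'encoder-layer-num': [6],
    'decoder-layer-num': [6, 5, 4, 3, 2, 1],
    'encoder-self-attention-heads': [8, 4, 2],
    'decoder-self-attention-heads': [8, 4, 2],
    'decoder-ende-attention-heads': [8, 4, 2],
    'decoder-arbitrary-ende-attn': [-1, 1, 2],
}

def sample_subtransformer():
    """Draw one random SubTransformer configuration from the space."""
    return {k: random.choice(v) for k, v in SPACE.items()}

# Collecting a latency dataset then amounts to timing each sampled
# config on the target hardware and writing (features, latency) rows.
print(sample_subtransformer())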
(YAML config for latency-predictor training; file name not shown in this commit view)

@@ -0,0 +1,10 @@
lat-dataset-path: ./latency_dataset/iwslt14deen_gpu_titanxp_all.csv
feature-norm: [640, 6, 2048, 6, 640, 6, 2048, 6, 6, 2]
lat-norm: 200
feature-dim: 10
hidden-dim: 400
hidden-layer-num: 3
ckpt-path: ./latency_dataset/predictors/iwslt14deen_gpu_titanxp.pt
train-steps: 5000
bsz: 128
lr: 1e-5
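
For intuition, a minimal PyTorch sketch (an assumption, not the repository's implementation) of a latency predictor matching these hyperparameters: a 10-dimensional architecture feature vector is normalized elementwise by feature-norm, passed through 3 hidden layers of width 400, and the scalar output is rescaled by lat-norm back to milliseconds.

import torch
import torch.nn as nn

# Hyperparameters from the config above.
FEATURE_NORM = torch.tensor([640, 6, 2048, 6, 640, 6, 2048, 6, 6, 2], dtype=torch.float)
LAT_NORM, FEATURE_DIM, HIDDEN_DIM, HIDDEN_LAYERS = 200.0, 10, 400, 3

class LatencyPredictor(nn.Module):
    def __init__(self):
        super().__init__()
        layers, in_dim = [], FEATURE_DIM
        for _ in range(HIDDEN_LAYERS):
            layers += [nn.Linear(in_dim, HIDDEN_DIM), nn.ReLU()]
            in_dim = HIDDEN_DIM
        layers.append(nn.Linear(in_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, features):
        # Normalize each architecture feature, predict a scaled latency,
        # then map back to milliseconds via lat-norm.
        x = features / FEATURE_NORM
        return self.net(x).squeeze(-1) * LAT_NORM

predictor = LatencyPredictor()
dummy = torch.tensor([[512., 6, 1024, 6, 512, 3, 1024, 4, 4, 1]])  # hypothetical feature row
print(predictor(dummy))  # predicted latency in ms (untrained weights)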
