lambada_src_uri = 'http://eaidata.bmk.sh/data/lambada_test.jsonl'
normalization = 'NFKC'

+
# Note: this task is called "lambada" but it really refers to OpenAI's version
# of the task, which actually differs in some ways from the task described in
# the original paper. So, strictly speaking, accuracy values from this task
@@ -29,31 +30,34 @@ def lambada_create_tokens_data(params, path):
        json.dump(arrays, f)
        return arrays

+
def lambada_read_or_create_tokens_data(params, path):
    # if you tell me where the file should go, i will helpfully create it for you
    if not os.path.exists(path):
        return lambada_create_tokens_data(params, path)
    with open(path) as f:
        return json.load(f)

+
def bin_pack(params, tokens_data):
    eos_token = params['eos_id']
    n_ctx = params['n_ctx']
    dummy_token = 1
    pad_batch_size = params['eval_batch_size']
    bins = []
    for a in tokens_data:
-        if len(bins) == 0 or len(bins[-1])+len(a)+1 > n_ctx:
+        if len(bins) == 0 or len(bins[-1]) + len(a) + 1 > n_ctx:
            bins.append([])
        bins[-1] += a
        bins[-1].append(eos_token)
-    while len(bins)%pad_batch_size != 0:
+    while len(bins) % pad_batch_size != 0:
        bins.append([])
    bins_array = np.full((len(bins), n_ctx), dummy_token, dtype=np.uint16)
    for i, b in enumerate(bins):
        bins_array[i, 0:len(b)] = b
    return bins_array

+
def lambada_init(params):
    ds_configs = params['dataset_configs']
    l = [
@@ -67,45 +71,14 @@ def lambada_init(params):
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = bin_pack(params, tokens_data)
    params['lambada_tokens_path'] = lt_path
-    params['lambada_n_steps'] = len(bins_array)//params['eval_batch_size']
+    params['lambada_n_steps'] = len(bins_array) // params['eval_batch_size']
+

def lambada_get_task_info(params):
    return {
        'n_steps': params['lambada_n_steps'],
    }

-def wikitext_detokenizer(string):
-    # contractions
-    string = string.replace("s '", "s'")
-    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
-    # number separators
-    string = string.replace(" @-@ ", "-")
-    string = string.replace(" @,@ ", ",")
-    string = string.replace(" @.@ ", ".")
-    # punctuation
-    string = string.replace(" : ", ": ")
-    string = string.replace(" ; ", "; ")
-    string = string.replace(" . ", ". ")
-    string = string.replace(" ! ", "! ")
-    string = string.replace(" ? ", "? ")
-    string = string.replace(" , ", ", ")
-    # double brackets
-    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
-    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
-    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
-    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
-    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
-    # miscellaneous
-    string = string.replace("= = = =", "====")
-    string = string.replace("= = =", "===")
-    string = string.replace("= =", "==")
-    string = string.replace(" " + chr(176) + " ", chr(176))
-    string = string.replace(" \n ", "\n")
-    string = string.replace("\n ", "\n")
-    string = string.replace(" N ", " 1 ")
-    string = string.replace(" 's", "'s")
-
-    return string

# The LAMBADA evaluation code looks at the logits of each position just before an eos_token
def lambada_input(params):
@@ -115,80 +88,19 @@ def lambada_input(params):
    tokens_data = lambada_read_or_create_tokens_data(params, lt_path)
    bins_array = bin_pack(params, tokens_data)
    dataset = tf.data.Dataset.from_tensor_slices(bins_array)
-    def _get_output(bin):
-        bin = tf.cast(bin, dtype=tf.int32)
-        indexes = tf.range(n_ctx)
-        results = tf.gather(bin, (indexes + 1)%n_ctx)
-        eos_next_positions = tf.math.equal(tf.gather(bin, (indexes + 2)%n_ctx), eos_token)
-        output = tf.where(eos_next_positions, results, tf.constant(eos_token, shape=[n_ctx]))
-        bin = tf.reshape(bin, [n_ctx])
-        bin = tf.cast(bin, dtype=tf.int32)
-        output = tf.reshape(output, [n_ctx])
-        output = tf.cast(output, dtype=tf.int32)
-        return bin, output
-    dataset = dataset.map(_get_output)
-    dataset = dataset.batch(params['eval_batch_size'], drop_remainder=True)
-    dataset = dataset.repeat()
-    return dataset

-def wikitext_create_tokens_data(params, path, version):
-    assert version.lower() in ["wikitext2", "wikitext103"]
-    wikitext2_src = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip"
-    wikitext103_src = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
-    version_src = wikitext103_src if version.lower() == "wikitext103" else wikitext2_src
-    with open(path, 'w') as f:
-        wikitext_path = f"./{version}-raw-v1.zip"
-        os.system(f"wget {version_src} -O {wikitext_path}")
-        os.makedirs(f"{version}", exist_ok=True)
-        os.system(f"unzip {wikitext_path} -d {version}")
-        n = 103 if version.lower() == "wikitext103" else 2
-        with open(f"./{version}/wikitext-{n}-raw/wiki.test.raw", 'r') as wt:
-            text = ftfy.fix_text(wikitext_detokenizer(wt.read()))
-        enc = fetch_encoder(params)
-        encoded_text = encode(enc, text)
-        arrays = []
-        for i in range(0, len(encoded_text), params["n_ctx"] - 1):
-            arrays.append(encoded_text[i:i + params["n_ctx"] - 1])
-        json.dump(arrays, f)
-        return arrays
-
-def wikitext_read_or_create_tokens_data(params, path, version):
-    # if you tell me where the file should go, i will helpfully create it for you
-    if not os.path.exists(path):
-        return wikitext_create_tokens_data(params, path, version)
-    with open(path) as f:
-        return json.load(f)
-
-def wikitext_init(params, version):
-    wikitext_path = version + ".json"
-    tokens_data = wikitext_read_or_create_tokens_data(params, wikitext_path, version)
-    bins_array = bin_pack(params, tokens_data)
-    params['wikitext_path'] = wikitext_path
-    params['wikitext_n_steps'] = len(bins_array)//params['eval_batch_size']
-
-def wikitext_get_task_info(params, version):
-    return {
-        'n_steps': params['wikitext_n_steps'],
-    }
-
-def wikitext_input(params, version):
-    eos_token = 50256 if params['n_vocab'] >= 50257 else 0
-    n_ctx = params['n_ctx']
-    wt_path = params['wikitext_path']
-    tokens_data = wikitext_read_or_create_tokens_data(params, wt_path, version)
-    bins_array = bin_pack(params, tokens_data)
-    dataset = tf.data.Dataset.from_tensor_slices(bins_array)
    def _get_output(bin):
        bin = tf.cast(bin, dtype=tf.int32)
        indexes = tf.range(n_ctx)
-        results = tf.gather(bin, (indexes + 1) % n_ctx)
-        eos_next_positions = tf.math.equal(tf.gather(bin, (indexes + 2) % n_ctx), eos_token)
+        results = tf.gather(bin, (indexes + 1) % n_ctx)
+        eos_next_positions = tf.math.equal(tf.gather(bin, (indexes + 2) % n_ctx), eos_token)
        output = tf.where(eos_next_positions, results, tf.constant(eos_token, shape=[n_ctx]))
        bin = tf.reshape(bin, [n_ctx])
        bin = tf.cast(bin, dtype=tf.int32)
        output = tf.reshape(output, [n_ctx])
        output = tf.cast(output, dtype=tf.int32)
        return bin, output
+
    dataset = dataset.map(_get_output)
    dataset = dataset.batch(params['eval_batch_size'], drop_remainder=True)
    dataset = dataset.repeat()
@@ -200,15 +112,5 @@ def _get_output(bin):
        'init_fn': lambada_init,
        'get_task_info_fn': lambada_get_task_info,
        'input_fn': lambada_input,
-    },
-    'wikitext2': {
-        'init_fn': partial(wikitext_init, version='wikitext2'),
-        'get_task_info_fn': partial(wikitext_get_task_info, version='wikitext2'),
-        'input_fn': partial(wikitext_input, version='wikitext2'),
-    },
-    'wikitext103': {
-        'init_fn': partial(wikitext_init, version='wikitext103'),
-        'get_task_info_fn': partial(wikitext_get_task_info, version='wikitext103'),
-        'input_fn': partial(wikitext_input, version='wikitext103'),
-    },
+    }
}
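
As a reading aid, not part of this commit: the comment above lambada_input notes that the evaluation only scores the position just before each eos_token. Below is a minimal NumPy sketch of the masking that the surviving _get_output performs, using invented token values and a toy context length.

import numpy as np

# Toy bin of packed LAMBADA examples. 50256 is the eos id used elsewhere in
# this file and 1 is bin_pack's dummy/pad token; the other values are made up.
eos_token = 50256
n_ctx = 8
packed_bin = np.array([11, 12, 13, eos_token, 21, 22, eos_token, 1])

indexes = np.arange(n_ctx)
results = packed_bin[(indexes + 1) % n_ctx]                     # next token at each position
eos_next_positions = packed_bin[(indexes + 2) % n_ctx] == eos_token
output = np.where(eos_next_positions, results, eos_token)

# output == [50256, 13, 50256, 50256, 22, 50256, 50256, 50256]:
# only the positions that must predict a passage's final word (13 and 22, each
# immediately followed by eos) keep a real target; everything else is eos_token.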