Skip to content

Commit 62f30eb

Browse files
committed
publish dataset & add README
1 parent a484334 commit 62f30eb

File tree

11 files changed

+91
-22
lines changed

11 files changed

+91
-22
lines changed

README.md

+47-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,47 @@
1-
# SSP-MMC
1+
# SSP-MMC
2+
3+
Copyright (c) 2022 [MaiMemo](https://www.maimemo.com/), Inc. MIT License.
4+
5+
Stochastic-Shortest-Path-Minimize-Memorization-Cost (SSP-MMC) is a spaced repetition scheduling algorithm used to help learners remember more words in MaiMemo, a language learning application in China.
6+
7+
This repository contains a public release of the data and code used for several experiments in the following paper (which introduces SSP-MMC):
8+
9+
> Awaiting the review decision from SIGKDD 2022.
10+
11+
# Software
12+
13+
The file `data_preprocessing.py` is used to preprocess data for the DHP model.
14+
15+
The file `cal_model_param.py` contains the DHP model and HLR model.
16+
17+
The file `model/utils.py` saves the parameters of the DHP model for training and simulation.
18+
19+
The file `algo/main.cpp` contains a C++ implementation of SSP-MMC, which aims to find the optimal policy.
20+
21+
The file `simulator.py` provides an environment for comparing different scheduling algorithms.
22+
23+
## Workflow
24+
25+
1. Run `data_preprocessing.py` -> `halflife_for_fit.tsv`
26+
2. Run `cal_model_param.py` -> `intercept_` and `coef_` for the DHP model
27+
3. Save the parameters to the functions `cal_recall_halflife` and `cal_forget_halflife` in `model/utils.py` and the function `cal_next_recall_halflife` in `algo/main.cpp`
28+
4. Run `algo/main.cpp` -> optimal policy in `algo/result/`
29+
5. Run `simulator.py` to compare SSP-MMC with several baselines.
30+
31+
## Data Set and Format
32+
33+
The dataset is available on [Dataverse](https://doi.org/10.7910/DVN/VAGUL0) (1.6 GB). It is a 7-Zip-compressed TSV file containing the 220 million MaiMemo student memory behavior logs used in our experiments.
34+
35+
The columns are as follows:
36+
37+
- `u` - student user ID who reviewed the word (anonymized)
38+
- `w` - spelling of the word
39+
40+
- `i` - total times the user has reviewed the word
41+
- `d` - difficulty of the word
42+
- `t_history` - interval sequence of the historic reviews
43+
- `r_history` - recall sequence of the historic reviews
44+
- `delta_t` - time elapsed from the last review
45+
- `r` - result of the review
46+
- `p_recall` - probability of recall
47+
- `total_cnt` - number of users who did the same memory behavior

algo/main.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ float cal_start_halflife(int difficulty) {
2222

2323
float cal_next_recall_halflife(float h, float p, int d, int recall) {
2424
if (recall == 1) {
25-
return h * (1 + exp(3.80863264) * pow(d, -0.53420593) * pow(h, -0.127362) * pow(1 - p, 0.967804));
25+
return h * (1 + exp(3.81) * pow(d, -0.534) * pow(h, -0.127) * pow(1 - p, 0.97));
2626
} else {
27-
return exp(-0.04158382) * pow(d, -0.04067209) * pow(h, 0.37745957) * pow(1 - p, -0.22724425);
27+
return exp(-0.041) * pow(d, -0.041) * pow(h, 0.377) * pow(1 - p, -0.227);
2828
}
2929
}
3030

algo/result/README.md

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Introduction
2+
3+
The CSV files whose names begin with `cost` record the expected review cost for each memory state.
4+
5+
The CSV files whose names begin with `ivl` record the optimal review interval for each memory state.
6+
7+
The CSV files whose names begin with `recall` record the recall probability corresponding to the optimal review interval for each memory state.

cal_model_param.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,11 @@ def fit_recall_halflife(raw):
7575
raw['predict_halflife_hlr'] = y_pred
7676
fig = go.Figure()
7777
fig.add_trace(
78-
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_dhp'], marker_size=np.log(raw['group_cnt']) / 2,
78+
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_dhp'], marker_size=np.log(raw['group_cnt']),
7979
mode='markers',
8080
name='DHP'))
8181
fig.add_trace(
82-
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_hlr'], marker_size=np.log(raw['group_cnt']) / 2,
82+
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_hlr'], marker_size=np.log(raw['group_cnt']),
8383
mode='markers',
8484
name='HLR', opacity=0.7))
8585
fig.update_xaxes(title_text='observed half-life after recall')
@@ -147,11 +147,11 @@ def fit_forget_halflife(raw):
147147
raw['predict_halflife_hlr'] = y_pred
148148
fig = go.Figure()
149149
fig.add_trace(
150-
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_dhp'], marker_size=np.log(raw['group_cnt']) / 2,
150+
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_dhp'], marker_size=np.log(raw['group_cnt']),
151151
mode='markers',
152152
name='DHP'))
153153
fig.add_trace(
154-
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_hlr'], marker_size=np.log(raw['group_cnt']) / 2,
154+
go.Scatter(x=raw['halflife'], y=raw['predict_halflife_hlr'], marker_size=np.log(raw['group_cnt']),
155155
mode='markers',
156156
name='HLR', opacity=0.7))
157157
fig.update_xaxes(title_text='observed half-life after forget')

data/README.md

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Introduction
2+
3+
Please unzip the dataset files to this directory. The necessary files include:
4+
5+
- `opensource_dataset_difficulty.tsv`
6+
- `opensource_dataset_forgetting_curve.tsv`

fit_data.py

-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import sys
44
import os
55
import time
6-
import numpy as np
76
import pandas as pd
87
import math
98
from collections import namedtuple

model/utils.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,16 @@ def cal_start_halflife(difficulty):
2323

2424
def cal_recall_halflife(difficulty, halflife, p_recall):
2525
return halflife * (
26-
1 + np.exp(3.80863264) * np.power(difficulty, -0.53420593) * np.power(halflife, -0.127362) * np.power(
27-
1 - p_recall, 0.9678043))
26+
1 + np.exp(3.81) * np.power(difficulty, -0.534) * np.power(halflife, -0.127) * np.power(
27+
1 - p_recall, 0.97))
2828

2929

3030
def cal_forget_halflife(difficulty, halflife, p_recall):
31-
return np.exp(-0.04158382) * np.power(difficulty, -0.04067209) * np.power(halflife, 0.37745957) * np.power(
32-
1 - p_recall, -0.22724425)
31+
return np.exp(-0.041) * np.power(difficulty, -0.041) * np.power(halflife, 0.377) * np.power(
32+
1 - p_recall, -0.227)
3333

3434

35+
# the following code is from https://github.com/Networks-Learning/memorize
3536
def intensity(t, n_t, q):
3637
return 1.0 / np.sqrt(q) * (1 - np.exp(-n_t * t))
3738

plot/README.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Introduction
2+
3+
The PDF files show the statistical information about the dataset.
4+
5+
The HTML files are the dynamic version.

simulation_result/README.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Introduction
2+
3+
The TSV files record the review history of each word during the simulation. Their names represent the scheduling algorithm they used.
4+
5+
The PDF files show the processes of simulation.

simulator.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -200,23 +200,23 @@ def scheduler(difficulty, halflife, reps, lapses, method):
200200
total_cost = int(sum(cost_per_day))
201201

202202
plt.figure(1)
203-
plt.plot(record_per_day, label=f'{method}', linewidth=0.8)
203+
plt.plot(record_per_day, label=f'{method}')
204204

205205
plt.figure(2)
206-
plt.plot(meet_target_per_day, label=f'{method}', linewidth=0.8)
206+
plt.plot(meet_target_per_day, label=f'{method}')
207207
cost_day = np.argmax(meet_target_per_day >= compare_target)
208208
if cost_day > 0:
209209
print(f'cost day: {cost_day}')
210210
plt.plot(cost_day, compare_target, 'k*', linewidth=2)
211211

212212
plt.figure(3)
213-
plt.plot(new_item_per_day_average_per_period, label=f'{method}', linewidth=0.8)
213+
plt.plot(new_item_per_day_average_per_period, label=f'{method}')
214214

215215
plt.figure(4)
216-
plt.plot(cost_per_day_average_per_period, label=f'{method}', linewidth=0.8)
216+
plt.plot(cost_per_day_average_per_period, label=f'{method}')
217217

218218
plt.figure(5)
219-
plt.plot(learned_per_day, label=f'{method}', linewidth=0.8)
219+
plt.plot(learned_per_day, label=f'{method}')
220220

221221
print('acc learn', total_learned)
222222
print('meet target', meet_target)
@@ -236,7 +236,7 @@ def scheduler(difficulty, halflife, reps, lapses, method):
236236
# plt.plot(....)
237237
pdf.savefig()
238238
plt.figure(2)
239-
plt.plot((0, learn_days), (compare_target, compare_target), color='black', linestyle='dotted', linewidth=0.8)
239+
plt.plot((0, learn_days), (compare_target, compare_target), color='black', linestyle='dotted')
240240
plt.title(f"day cost limit:{day_cost_limit}-learn days:{learn_days}")
241241
plt.xlabel("days")
242242
plt.ylabel("THR")

visualization.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -234,8 +234,8 @@ def policy_action_visualize():
234234

235235

236236
if __name__ == "__main__":
237-
# difficulty_visualize()
238-
# forgetting_curve_visualize()
239-
# raw_data_visualize()
240-
# dhp_model_visualize()
237+
difficulty_visualize()
238+
forgetting_curve_visualize()
239+
raw_data_visualize()
240+
dhp_model_visualize()
241241
policy_action_visualize()

0 commit comments

Comments
 (0)