Skip to content

Commit a22db3b

Browse files
committed
Update preprocessing scripts
1 parent 5731083 commit a22db3b

File tree

1 file changed

+24
-33
lines changed

1 file changed

+24
-33
lines changed

nb/extract_netku.ipynb

+24 -33
Original file line number | Diff line number | Diff line change
@@ -1573,25 +1573,41 @@
15731573
},
15741574
{
15751575
"cell_type": "code",
1576-
"execution_count": 52,
1576+
"execution_count": 11,
15771577
"id": "bb62fa4e-e331-4f0f-ae0a-fb5eece8ee71",
15781578
"metadata": {},
1579-
"outputs": [],
1579+
"outputs": [
1580+
{
1581+
"data": {
1582+
"text/plain": [
1583+
"168"
1584+
]
1585+
},
1586+
"execution_count": 11,
1587+
"metadata": {},
1588+
"output_type": "execute_result"
1589+
}
1590+
],
15801591
"source": [
1581-
"hyps = csv_file.ref.to_list()"
1592+
"# hyps = csv_file.ref.to_list()\n",
1593+
"hyps = []\n",
1594+
"with open(\"/Users/quert/Documents/GitHub/edit_NetKu/dataset/same_secs_insert_labeled/final_hyp.src\", \"r\") as f:\n",
1595+
" for line in f.readlines():\n",
1596+
" hyps.append(line.strip())\n",
1597+
"len(hyps)"
15821598
]
15831599
},
15841600
{
15851601
"cell_type": "code",
1586-
"execution_count": 53,
1602+
"execution_count": 12,
15871603
"id": "7871b42c-1dbc-4bcb-b104-23bb2d4b26a3",
15881604
"metadata": {},
15891605
"outputs": [
15901606
{
15911607
"name": "stderr",
15921608
"output_type": "stream",
15931609
"text": [
1594-
"Token indices sequence length is longer than the specified maximum sequence length for this model (3779 > 1024). Running this sequence through the model will result in indexing errors\n"
1610+
"Token indices sequence length is longer than the specified maximum sequence length for this model (6239 > 1024). Running this sequence through the model will result in indexing errors\n"
15951611
]
15961612
}
15971613
],
@@ -1607,49 +1623,24 @@
16071623
},
16081624
{
16091625
"cell_type": "code",
1610-
"execution_count": 54,
1626+
"execution_count": 13,
16111627
"id": "ce1769ed-b6e6-4c99-ab6d-7a4e567ddaa3",
16121628
"metadata": {},
16131629
"outputs": [
16141630
{
16151631
"data": {
16161632
"text/plain": [
1617-
"(34, 67597, 5766.785714285715, 3167.0)"
1633+
"(76, 86560, 7774.184523809524, 4387.5)"
16181634
]
16191635
},
1620-
"execution_count": 54,
1636+
"execution_count": 13,
16211637
"metadata": {},
16221638
"output_type": "execute_result"
16231639
}
16241640
],
16251641
"source": [
16261642
"np.min(lengths),np.max(lengths),np.mean(lengths), np.median(lengths)"
16271643
]
1628-
},
1629-
{
1630-
"cell_type": "code",
1631-
"execution_count": 10,
1632-
"id": "5c277ded-9567-44e5-9425-d5743af04d0e",
1633-
"metadata": {},
1634-
"outputs": [
1635-
{
1636-
"data": {
1637-
"text/plain": [
1638-
"(73846, 10425, 11253)"
1639-
]
1640-
},
1641-
"execution_count": 10,
1642-
"metadata": {},
1643-
"output_type": "execute_result"
1644-
}
1645-
],
1646-
"source": [
1647-
"train_csv = pd.read_csv(\"/Users/quert/Documents/GitHub/edit_NetKu/dataset/same_secs_insert_labeled/merged_train.csv\")\n",
1648-
"test_csv = pd.read_csv(\"/Users/quert/Documents/GitHub/edit_NetKu/dataset/same_secs_insert_labeled/merged_test.csv\")\n",
1649-
"val_csv = pd.read_csv(\"/Users/quert/Documents/GitHub/edit_NetKu/dataset/same_secs_insert_labeled/merged_val.csv\")\n",
1650-
"\n",
1651-
"len(train_csv), len(test_csv), len(val_csv)"
1652-
]
16531644
}
16541645
],
16551646
"metadata": {

0 commit comments

Comments
 (0)