Skip to content

Commit 855f528

Browse files
committed
spreadthesign (takes more than a week to build)
1 parent bbdbbe0 commit 855f528

File tree

7 files changed

+172
-1
lines changed

7 files changed

+172
-1
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@ build/
1111
dist/
1212
sign_language_datasets.egg-info/
1313
.DS_Store
14+
15+
sign_language_datasets/datasets/spread_the_sign/splits/1.0.0-uzh/*.txt

sign_language_datasets/datasets/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,5 @@
2121
from .wmt_slt import WMTSLT
2222
from .asl_signs import ASLSigns
2323
from .sem_lex import SemLex
24-
from .asl_citizen import ASLCitizen
24+
from .asl_citizen import ASLCitizen
25+
from .spread_the_sign import SpreadTheSign
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""asl-signs dataset."""
2+
3+
from .spread_the_sign import SpreadTheSign

sign_language_datasets/datasets/spread_the_sign/dummy_data/TODO-add_fake_data_in_this_directory.txt

Whitespace-only changes.
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""Create the train/val/test id lists for the SpreadTheSign dataset.

Usage::

    python <this script> CSV_PATH OUT_DIR

Reads the CSV at ``CSV_PATH`` and writes ``all.txt``, ``train.txt``,
``val.txt`` and ``test.txt`` into ``OUT_DIR``, one id per line. An id is the
zero-based row position of an example in the CSV.
"""
import os
import random
import sys

import pandas as pd

# Fixed seed so that re-running the script reproduces the same split files.
seed = 3407
val_ratio = 0.001
test_ratio = 0.001


def write(filename, video_ids, out_dir=""):
    """Write ``video_ids`` to ``out_dir/filename``, one id per line."""
    # os.path.join instead of string concatenation: the output directory may
    # be given with or without a trailing path separator.
    with open(os.path.join(out_dir, filename), 'w') as f:
        f.writelines(f"{video_id}\n" for video_id in video_ids)


def split_ids(video_ids, seed=seed, val_ratio=val_ratio, test_ratio=test_ratio):
    """Shuffle ``video_ids`` deterministically and cut them into three parts.

    The input is not modified. After shuffling, the first
    ``int(len * val_ratio)`` ids form the validation set, the next
    ``int(len * test_ratio)`` ids the test set, and the remainder the
    training set.

    Returns:
        A ``(train, val, test)`` tuple of lists.
    """
    shuffled = list(video_ids)
    random.seed(seed)
    random.shuffle(shuffled)

    val_end = int(len(shuffled) * val_ratio)
    test_end = val_end + int(len(shuffled) * test_ratio)

    return shuffled[test_end:], shuffled[:val_end], shuffled[val_end:test_end]


def main(argv=None):
    """Entry point; ``argv`` defaults to ``sys.argv[1:]``.

    Raises:
        SystemExit: if fewer than two arguments are given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        raise SystemExit("usage: CSV_PATH OUT_DIR")
    csv_path, out_dir = args[0], args[1]

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    video_ids = df.index.values.tolist()

    # all.txt keeps the original CSV order; the other files are shuffled.
    write('all.txt', video_ids, out_dir)

    train_ids, val_ids, test_ids = split_ids(video_ids)
    write('train.txt', train_ids, out_dir)
    write('val.txt', val_ids, out_dir)
    write('test.txt', test_ids, out_dir)


if __name__ == "__main__":
    main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Spreadthesign"""
2+
import csv
3+
import tarfile
4+
from os import path
5+
6+
import numpy as np
7+
import pyarrow.parquet as pq
8+
9+
import tensorflow as tf
10+
import tensorflow_datasets as tfds
11+
from tensorflow.io.gfile import GFile
12+
13+
from pose_format import Pose
14+
from pose_format import Pose, PoseHeader
15+
from pose_format.numpy import NumPyPoseBody
16+
from pose_format.pose_header import PoseHeaderDimensions
17+
from pose_format.utils.holistic import holistic_components
18+
19+
from sign_language_datasets.utils.features import PoseFeature
20+
21+
from ..warning import dataset_warning
22+
from ...datasets.config import SignDatasetConfig
23+
24+
_DESCRIPTION = """
25+
SpreadTheSign2 is a notable multilingual dictio- nary containing around 23,000 words with up to 41 different spoken-sign language pairs and more than 600,000 videos in total.
26+
"""
27+
28+
_CITATION = """
29+
"""
30+
31+
_POSE_HEADERS = {"holistic": path.join(path.dirname(path.realpath(__file__)), "holistic.poseheader")}
32+
33+
_KNOWN_SPLITS = {
34+
"1.0.0-uzh": path.join(path.dirname(path.realpath(__file__)), "splits/1.0.0-uzh"),
35+
}
36+
37+
38+
class SpreadTheSign(tfds.core.GeneratorBasedBuilder):
    """DatasetBuilder for Spreadthesign dataset.

    Runtime options are read from the ``extra`` mapping of the builder config:

    * ``csv_path``: CSV file listing the examples; its first row is a header.
    * ``pose_dir``: directory that the pose file names in the CSV are joined to.
    * ``split`` (optional): key into ``_KNOWN_SPLITS``. When present, train /
      validation / test splits are built from the id lists in that directory;
      otherwise a single train split with every CSV row is produced.
    """

    VERSION = tfds.core.Version("1.0.0")
    RELEASE_NOTES = {
        "1.0.0": "Initial release.",
    }

    BUILDER_CONFIGS = [
        SignDatasetConfig(name="default", include_pose='holistic'),
    ]

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata."""

        features = {
            "id": tfds.features.Text(),
            "text": tfds.features.Text(),
            "sign_language": tfds.features.Text(),
            "spoken_language": tfds.features.Text(),
            "pose_path": tfds.features.Text(),
            "pose_length": tf.float32,
        }

        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict(features),
            homepage="https://www.spreadthesign.com/",
            supervised_keys=None,
            citation=_CITATION,
        )

    def _load_split_ids(self, split: str):
        """Returns the ids listed in ``<split>.txt`` of the configured split directory.

        Each line of the file is one id; only the trailing newline is removed.
        """
        split_dir = _KNOWN_SPLITS[self._builder_config.extra['split']]

        with open(path.join(split_dir, f'{split}.txt')) as f:
            return [line.rstrip('\n') for line in f]

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators."""
        dataset_warning(self)

        pose_dir = self._builder_config.extra['pose_dir']

        if 'split' in self._builder_config.extra:
            train_args = {"pose_dir": pose_dir, "ids": self._load_split_ids('train')}
            val_args = {"pose_dir": pose_dir, "ids": self._load_split_ids('val')}
            test_args = {"pose_dir": pose_dir, "ids": self._load_split_ids('test')}

            return [
                tfds.core.SplitGenerator(name=tfds.Split.TRAIN, gen_kwargs=train_args),
                tfds.core.SplitGenerator(name=tfds.Split.VALIDATION, gen_kwargs=val_args),
                tfds.core.SplitGenerator(name=tfds.Split.TEST, gen_kwargs=test_args),
            ]
        else:
            # No split configured: every CSV row goes into the train split.
            return [tfds.core.SplitGenerator(name=tfds.Split.TRAIN, gen_kwargs={"pose_dir": pose_dir})]

    def _generate_examples(self, pose_dir: str, ids: list = None):
        """Yields examples.

        Args:
            pose_dir: directory that the pose file name in CSV column 0 is joined to.
            ids: optional collection of example ids (stringified zero-based row
                positions in the CSV, header excluded) to keep. ``None`` or an
                empty collection keeps every row.

        Yields:
            ``(id, example)`` pairs, in CSV order.
        """
        # Membership is tested once per CSV row, so use a set built once
        # instead of scanning the id list for every row.
        wanted_ids = set(ids) if ids else None
        include_pose = self.builder_config.include_pose

        with GFile(self._builder_config.extra['csv_path'], "r") as csv_file:
            csv_data = csv.reader(csv_file, delimiter=",")
            next(csv_data)  # Ignore the header

            for i, row in enumerate(csv_data):
                datum = {
                    "id": str(i),
                    "text": row[3],
                    "sign_language": row[1],
                    "spoken_language": row[2],
                }

                if wanted_ids is not None and datum["id"] not in wanted_ids:
                    continue

                if include_pose == "holistic":
                    mediapipe_path = path.join(pose_dir, row[0])

                    if path.exists(mediapipe_path):
                        datum["pose_path"] = mediapipe_path
                        # The pose file is read only to record its number of frames.
                        with open(mediapipe_path, "rb") as f:
                            pose = Pose.read(f.read())
                        datum["pose_length"] = pose.body.data.shape[0]

                # NOTE(review): when poses are disabled or the pose file is
                # missing, "pose_path" and "pose_length" are not set although
                # _info declares them -- confirm how such examples should be
                # handled.
                yield datum['id'], datum

0 commit comments

Comments
 (0)