Skip to content

Commit 4cbc900

Browse files
authored
Merge pull request #17 from arokem/arokem-092024
More edits towards finalization
2 parents e4135fa + 2f44db6 commit 4cbc900

File tree

5 files changed

+325
-98
lines changed

5 files changed

+325
-98
lines changed

references.bib

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,126 @@
1+
@ARTICLE{Hanisch2015-cu,
2+
title = "The Virtual Astronomical Observatory: Re-engineering access to
3+
astronomical data",
4+
author = "Hanisch, R J and Berriman, G B and Lazio, T J W and Emery Bunn, S
5+
and Evans, J and McGlynn, T A and Plante, R",
6+
journal = "Astron. Comput.",
7+
publisher = "Elsevier BV",
8+
volume = 11,
9+
pages = "190--209",
10+
abstract = "The US Virtual Astronomical Observatory was a software
11+
infrastructure and development project designed both to begin the
12+
establishment of an operational Virtual Observatory (VO) and to
13+
provide the US coordination with the international VO effort. The
14+
concept of the VO is to provide the means by which an astronomer
15+
is able to discover, access, and process data seamlessly,
16+
regardless of its physical location. This paper describes the
17+
origins of the VAO, including the predecessor efforts within the
18+
US National Virtual Observatory, and summarizes its main
19+
accomplishments. These accomplishments include the development of
20+
both scripting toolkits that allow scientists to incorporate VO
21+
data directly into their reduction and analysis environments and
22+
high-level science applications for data discovery, integration,
23+
analysis, and catalog cross-comparison. Working with the
24+
international community, and based on the experience from the
25+
software development, the VAO was a major contributor to
26+
international standards within the International Virtual
27+
Observatory Alliance. The VAO also demonstrated how an
28+
operational virtual observatory could be deployed, providing a
29+
robust operational environment in which VO services worldwide
30+
were routinely checked for aliveness and compliance with
31+
international standards. Finally, the VAO engaged in community
32+
outreach, developing a comprehensive web site with on-line
33+
tutorials, announcements, links to both US and internationally
34+
developed tools and services, and exhibits and hands-on training
35+
at annual meetings of the American Astronomical Society and
36+
through summer schools and community days. All digital products
37+
of the VAO Project, including software, documentation, and
38+
tutorials, are stored in a repository for community access. The
39+
enduring legacy of the VAO is an increasing expectation that new
40+
telescopes and facilities incorporate VO capabilities during the
41+
design of their data management systems.",
42+
month = jun,
43+
year = 2015,
44+
language = "en"
45+
}
46+
47+
@ARTICLE{Larobina2023-vq,
48+
title = "Thirty years of the {DICOM} standard",
49+
author = "Larobina, Michele",
50+
journal = "Tomography",
51+
publisher = "mdpi.com",
52+
volume = 9,
53+
number = 5,
54+
pages = "1829--1838",
55+
abstract = "Digital Imaging and Communications in Medicine (DICOM) is an
56+
international standard that defines a format for storing medical
57+
images and a protocol to enable and facilitate data communication
58+
among medical imaging systems. The DICOM standard has been
59+
instrumental in transforming the medical imaging world over the
60+
last three decades. Its adoption has been a significant
61+
experience for manufacturers, healthcare users, and research
62+
scientists. In this review, thirty years after introducing the
63+
standard, we discuss the innovation, advantages, and limitations
64+
of adopting the DICOM and its possible future directions.",
65+
month = oct,
66+
year = 2023,
67+
keywords = "DICOM; communication protocols; file formats; metadata;
68+
quantitative imaging",
69+
language = "en"
70+
}
71+
72+
@INPROCEEDINGS{Mustra2008-xk,
73+
title = "Overview of the {DICOM} standard",
74+
author = "Mustra, Mario and Delac, Kresimir and Grgic, Mislav",
75+
booktitle = "2008 50th International Symposium ELMAR",
76+
publisher = "IEEE",
77+
volume = 1,
78+
pages = "39--44",
79+
abstract = "Digital technology has in the last few decades entered almost
80+
every aspect of medicine. There has been a huge development in
81+
noninvasive medical imaging equipment. Because there are many
82+
medical equipment manufacturers, a standard for storage and
83+
exchange of medical images needed to be developed. DICOM (Digital
84+
Imaging and Communication in Medicine) makes medical image
85+
exchange more easy and independent of the imaging equipment
86+
manufacturer. Besides the image data, DICOM file format supports
87+
other information useful to describe the image. This makes DICOM
88+
easy to use and the data exchange fast and safe while avoiding
89+
possible confusion caused by multiple files for the same study.",
90+
month = sep,
91+
year = 2008
92+
}
93+
94+
95+
@ARTICLE{Scroggins2020-ut,
96+
title = "Once {FITS}, Always {FITS}? Astronomical Infrastructure in
97+
Transition",
98+
author = "Scroggins, Michael and Boscoe, Bernadette M",
99+
journal = "IEEE Ann. Hist. Comput.",
100+
publisher = "IEEE",
101+
volume = 42,
102+
number = 2,
103+
pages = "42--54",
104+
abstract = "The flexible interchange transport system (FITS) file format has
105+
become the de facto standard for sharing, analyzing, and
106+
archiving astronomical data over the last four decades. FITS was
107+
adopted by astronomers in the early 1980s to overcome
108+
incompatibilities between operating systems. On the back of FITS’
109+
success, astronomical data became both backward compatible and
110+
easily shareable. However, new advances in the astronomical
111+
instrumentation, computational technologies, and analytic
112+
techniques have resulted in new data that do not work well within
113+
the traditional FITS format. Tensions have arisen between the
114+
desire to update the format to meet new analytic challenges and
115+
adherence to the original edict for the FITS file format to be
116+
backward compatible. We examine three inflection points in the
117+
governance of FITS: first, initial development and success,
118+
second, widespread acceptance and governance by the working
119+
group, and third, the challenges to FITS in a new era of
120+
increasing data and computational complexity within astronomy.",
121+
year = 2020
122+
}
123+
1124

2125
@ARTICLE{Musen2022metadata,
3126
title = "Without appropriate metadata, data-sharing mandates are

sections/02-use-cases.qmd

Lines changed: 59 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,34 @@ Image Transport System) file format standard, which was developed in the late
2020
astronomy data preservation and exchange. Essentially every software platform
2121
used in astronomy reads and writes the FITS format. It was developed by
2222
observatories in the 1980s to store image data in the visible and x-ray
23-
spectrum. It has been endorsed by IAU, as well as funding agencies. Though the
24-
format has evolved over time, “once FITS, always FITS”. That is, the format
25-
cannot be evolved to introduce changes that break backward compatibility.
26-
Among the features that make FITS so durable is that it was designed originally
27-
to have a very restricted metadata schema. That is, FITS records were designed
28-
to be the lowest common denominator of word lengths in computer systems at the
29-
time. However, while FITS is compact, its ability to encode the coordinate
30-
frame and pixels, means that data from different observational instruments can
31-
be stored in this format and relationships between data from different
32-
instruments can be related, rendering manual and error-prone procedures for
33-
conforming images obsolete.
23+
spectrum. It has been endorsed by the International Astronomical Union (IAU),
24+
as well as funding agencies. Though the format has evolved over time, “once
25+
FITS, always FITS”. That is, the format cannot be evolved to introduce changes
26+
that break backward compatibility. Among the features that make FITS so durable
27+
is that it was designed originally to have a very restricted metadata schema.
28+
That is, FITS records were designed to be the lowest common denominator of word
29+
lengths in computer systems at the time. However, while FITS is compact, its
30+
ability to encode the coordinate frame and pixels means that data from
31+
different observational instruments can be stored in this format and
32+
relationships between data from different instruments can be established, rendering
33+
manual and error-prone procedures for conforming images obsolete. Nevertheless,
34+
the stability has also raised some issues as the field continues to adapt to
35+
new measurement methods and the demands of ever-increasing data volumes and
36+
complex data analysis use-cases, such as interchange with other data and the use
37+
of complex databases to store and share data [@Scroggins2020-ut]. Another
38+
prominent example of the use of open-source processes to develop standards in
39+
Astronomy is in the tools and protocols developed by the International Virtual
40+
Observatory Alliance (IVOA) and its national implementations, e.g., in the US
41+
Virtual Astronomical Observatory [@Hanisch2015-cu]. The virtual observatories
42+
facilitate discovery and access across observatories around the world and
43+
underpin data discovery in astronomy. The IVOA took inspiration from the
44+
World-Wide Web Consortium (W3C) and adopted its process for the development of
45+
its standards (i.e., Working drafts $\rightarrow$ Proposed Recommendations
46+
$\rightarrow$ Recommendations), with individual standards developed by
47+
inter-institutional and international working groups. One of the outcomes of
48+
the coordination effort is the development of an ecosystem of software tools
49+
both developed within the observatory teams and within the user community that
50+
interoperate with the standards that were adopted by the observatories.
3451

3552
## High-energy physics (HEP)
3653

@@ -47,13 +64,38 @@ data is shared (i.e., in a standards-compliant manner).
4764

4865
## Earth sciences
4966

50-
The need for geospatial data exchange between different systems began to be recognized in the 1970s and 1980s, but proprietary formats still dominated. Coordinated standardization efforts brought the Open Geospatial Consortium (OGC) establishment in the 1990s, a critical step towards open standards for geospatial data. The 1990s have also seen the development of key standards such as the Network Common Data Form (NetCDF) developed by the University Corporation for Atmospheric Research (UCAR) and the Hierarchical Data Format (HDF), a set of file formats (HDF4, HDF5) that are widely used, particularly in climate research. The GeoTIFF format, which originated at NASA in the late 1990s, is extensively used to share image data. In the 1990s, open web mapping also began with MapServer (https://mapserver.org) and continued later with other projects such as OpenStreetMap (www.openstreetmap.org). The following two decades, the 2000s-2020s, brought an expansion of open standards and integration with web technologies developed by OGC, as well as other standards such as the Keyhole Markup Language (KML) for displaying geographic data in Earth browsers. Formats suitable for cloud computing also emerged, such as the Cloud Optimized GeoTIFF (COG), followed by Zarr and Apache Parquet for array and tabular data, respectively. In 2006, the Open Source Geospatial Foundation (OSGeo, https://www.osgeo.org) was established, demonstrating the community's commitment to the development of open-source geospatial technologies. While some standards have been developed in the industry (e.g., Keyhole Markup Language (KML) by Keyhole Inc., which Google later acquired), they later became international standards of the OGC, which now encompasses more than 450 commercial, governmental, nonprofit, and research organizations working together on the development and implementation of open standards (https://www.ogc.org).
67+
The need for geospatial data exchange between different systems began to be
68+
recognized in the 1970s and 1980s, but proprietary formats still dominated.
69+
Coordinated standardization efforts led to the establishment of the Open
70+
Geospatial Consortium (OGC) in the 1990s, a critical step towards open standards for
71+
geospatial data. The 1990s have also seen the development of key standards such
72+
as the Network Common Data Form (NetCDF) developed by the University
73+
Corporation for Atmospheric Research (UCAR), and the Hierarchical Data Format
74+
(HDF), a set of file formats (HDF4, HDF5) that are widely used, particularly in
75+
climate research. The GeoTIFF format, which originated at NASA in the late
76+
1990s, is extensively used to share image data. In the 1990s, open web mapping
77+
also began with MapServer (https://mapserver.org) and continued later with
78+
other projects such as OpenStreetMap (https://www.openstreetmap.org). The
79+
following two decades, the 2000s-2020s, brought an expansion of open standards
80+
and integration with web technologies developed by OGC, as well as other
81+
standards such as the Keyhole Markup Language (KML) for displaying geographic
82+
data in Earth browsers. Formats suitable for cloud computing also emerged, such
83+
as the Cloud Optimized GeoTIFF (COG), followed by Zarr and Apache Parquet for
84+
array and tabular data, respectively. In 2006, the Open Source Geospatial
85+
Foundation (OSGeo, https://www.osgeo.org) was established, demonstrating the
86+
community's commitment to the development of open-source geospatial
87+
technologies. While some standards have been developed in the industry (e.g.,
88+
Keyhole Markup Language (KML) by Keyhole Inc., which Google later acquired),
89+
they later became international standards of the OGC, which now encompasses
90+
more than 450 commercial, governmental, nonprofit, and research organizations
91+
working together on the development and implementation of open standards
92+
(https://www.ogc.org).
5193

5294
## Neuroscience
5395

54-
In contrast to astronomy and HEP, Neuroscience has traditionally been a
55-
"cottage industry", where individual labs have generated experimental data
56-
designed to answer specific experimental questions. While this model still
96+
In contrast to the previously-mentioned fields, Neuroscience has traditionally
97+
been a "cottage industry", where individual labs have generated experimental
98+
data designed to answer specific experimental questions. While this model still
5799
exists, the field has also seen the emergence of new modes of data production
58100
that focus on generating large shared datasets designed to answer many
59101
different questions, more akin to the data generated in large astronomy data
@@ -72,7 +114,7 @@ success to the adoption of OSS development mechanisms [@Poldrack2024BIDS]. For
72114
example, small changes to the standard are managed through the GitHub pull
73115
request mechanism; larger changes are managed through a BIDS Enhancement
74116
Proposal (BEP) process that is directly inspired by the Python programming
75-
language community's Python Enhancement Proposal procedure, which isused to
117+
language community's Python Enhancement Proposal procedure, which is used to
76118
introduce new ideas into the language. Though the BEP mechanism takes a
77119
slightly different technical approach, it tries to emulate the open-ended and
78120
community-driven aspects of Python development to accept contributions from a
@@ -102,3 +144,4 @@ if the standard is developed using git/GitHub for versioning, this would
102144
require learning the complex and obscure technical aspects of these systems that
103145
are far from easy to adopt, even for many professional scientists.
104146

147+

sections/03-challenges.qmd

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ community, and migration away from the standard. Similarly, if a standard
3131
evolves too rapidly, users may choose to stick to an outdated version of a
3232
standard for a long time, creating strains on the community of developers and
3333
maintainers of a standard who will need to accommodate long deprecation cycles.
34+
On the other hand, in cases in which some forms of dynamic change are prohibited
35+
-- as in the case of the FITS file format, which prohibits changes that break
36+
backward compatibility -- there is also a cost associated with the stability
37+
[@Scroggins2020-ut]: limiting adoption and combinations of new types of
38+
measurements, new analysis methods or new modes of data storage and data
39+
sharing.
3440

3541
## Mismatches between standards developers and user communities
3642

@@ -56,6 +62,18 @@ have not yet had significant adoption as tools of day-to-day computational
5662
practice. At the same time, it provides clarity and robustness for standards
5763
developers communities that are well-versed in these tools.
5864

65+
Another layer of potential mismatches arises when a more complex set of
66+
stakeholders needs to be considered. For example, the Group on Earth
67+
Observations (GEO) is a network that aims to coordinate decision making around
68+
satellite missions and to standardize the data that results from these
69+
missions. Because this group involves a range of different stakeholders,
70+
including individuals who more closely understand potential legal issues and
71+
researchers who are better equipped to evaluate technical and domain questions,
72+
communication is slower and hindered. As the group aims to move forward by
73+
consensus, these communication difficulties can slow down progress. This is
74+
just one example of the many cases in which an OSS process that
75+
strives for consensus can slow progress.
76+
5977

6078
## Cross-domain gaps
6179

@@ -146,6 +164,5 @@ grants (and see @sec-cross-sector). This hampers the long-term trajectory that
146164
is needed to inculcate a standard into the day-to-day practice of researchers.
147165

148166

149-
## The importance of automated validation
150167

151168

sections/04-cross-sector.qmd

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,24 @@ provide specific sources of friction. This is because proprietary/closed
9191
formats of data can create difficulty at various transition points: from one
9292
instrument vendor to another, from data producer to downstream recipient/user,
9393
etc. On the other hand, in some cases, cross-sector collaborations with
94-
commercial entities may pave the way to robust and useful standards. One
95-
example is the DICOM standard, which is maintained by working groups that
96-
encompass commercial imaging device vendors and researchers.
94+
commercial entities may pave the way to robust and useful standards. For
95+
example, imaging measurements in human subjects (e.g., in brain imaging
96+
experiments) significantly interact with standards for medical imaging, and
97+
chiefly the Digital Imaging and Communications in Medicine (DICOM) standard,
98+
which is widely used in a range of medical imaging applications, including in
99+
clinical settings [@Larobina2023-vq; @Mustra2008-xk]. The standard emerged from
100+
the demands of clinical practice in the 1980s, as digital technologies
101+
came into widespread use in medical imaging, through joint work of industry
102+
organizations: the American College of Radiology and the National Electrical
103+
Manufacturers Association. One of the defining features of the DICOM standard
104+
is that it allows manufacturers of instruments to define "private fields" that
105+
are compliant with the standard, but which may include idiosyncratically
106+
organized data and/or metadata. This provides significant flexibility, but can
107+
also easily lead to the loss of important information. Nevertheless, the human
108+
brain imaging case illustrates a situation in which industry standards and
109+
research standards coexist and need to communicate with each other effectively
110+
to advance research use-cases, while keeping up with the rapid development of
111+
the technologies.
97112

98113

99114

0 commit comments

Comments
 (0)