\documentclass[10.5pt]{article}
%% This is a convenience variable if you are using PGFPlots to build plots
%% within LaTeX. If you want to import PDF files for figures directly, you
%% can use the standard `\includegraphics' command. See the definition of
%% `\includetikz' in `tex/preamble-pgfplots.tex' for where the files are
%% assumed to be if you use `\includetikz' when `\makepdf' is not defined.
\newcommand{\makepdf}{}
%% When defined (value is irrelevant), `\highlightchanges' will cause text
%% in `\tonote' and `\new' to become colored. This is useful in cases that
%% you need to distribute drafts that are undergoing revision and you want
%% to highlight to your colleagues which parts are new and which parts are
%% only for discussion.
%\newcommand{\highlightchanges}{}
%% Import the necessary preambles.
\input{tex/src/preamble-style.tex}
\input{tex/build/macros/project.tex}
\input{tex/src/preamble-pgfplots.tex}
\input{tex/src/preamble-biblatex.tex}
\title{Towards long-term archivable reproducibility}
\author{\large\mpregular \authoraffil{Mohammad Akhlaghi}{1,2,3},
\large\mpregular \authoraffil{Ra\'ul Infante-Sainz}{1,2},
\large\mpregular \authoraffil{Boudewijn F. Roukema}{4,3},
\large\mpregular \authoraffil{David Valls-Gabaud}{5},
\large\mpregular \authoraffil{Roberto Baena-Gall\'e}{1,2}\\
{
\footnotesize\mplight
\textsuperscript{1} Instituto de Astrof\'isica de Canarias, Calle V\'ia L\'actea s/n, 38205 La Laguna, Tenerife, Spain.\\
\textsuperscript{2} Departamento de Astrof\'isica, Universidad de La Laguna, Avenida Astrof\'isico Francisco S\'anchez s/n, 38200 La Laguna, Tenerife, Spain.\\
\textsuperscript{3} Univ Lyon, Ens de Lyon, Univ Lyon1, CNRS, Centre de Recherche Astrophysique de Lyon UMR5574, F-69007, Lyon, France.\\
\textsuperscript{4} Institute of Astronomy, Faculty of Physics, Astronomy and Informatics, Nicolaus Copernicus University, Grudziadzka 5, 87-100 Toru\'n, Poland.\\
\textsuperscript{5} LERMA, CNRS, Observatoire de Paris, 61 Avenue de l'Observatoire, 75014 Paris, France.\\
Corresponding author: Mohammad Akhlaghi
(\href{mailto:mohammad@akhlaghi.org}{\textcolor{black}{mohammad@akhlaghi.org}})
}}
\date{}
\begin{document}%\layout
\thispagestyle{firstpage}
\maketitle
%% Abstract % max 250 words for CiSE
{\noindent\mpregular
%% CONTEXT
Many reproducible workflow solutions have been proposed over recent decades.
Most use the high-level technologies that were popular when they were created, providing an immediate solution that is unlikely to be sustainable in the long term.
Decades later, scientists lack the resources to rewrite their projects, while still being accountable for their results.
This creates generational gaps, which, together with technological obsolescence, impede reproducibility and building upon previous work.
%% AIM
We aim to introduce a set of criteria to address this problem and to demonstrate their practicality.
%% METHOD
The criteria have been tested in several research publications and can be summarized as: completeness (no dependency beyond a POSIX-compatible operating system, no administrator privileges, no network connection and storage primarily in plain-text); modular design; linking analysis with narrative; temporal provenance; scalability; and free-and-open-source software.
%% RESULTS
Through an implementation called ``Maneage'' (managing+lineage), we find that storing the project in machine-actionable and human-readable plain text enables version control, cheap archiving, automatic parsing to extract data provenance, and peer-reviewable verification.
Furthermore, we show that these criteria are not limited to long-term reproducibility but also provide immediate, fast short-term reproducibility.
%% CONCLUSION
We conclude that requiring longevity from solutions is realistic.
We discuss the benefits of these criteria for scientific progress.
\horizontalline
\noindent
{\mpbold Keywords:} Data Lineage, Data Provenance, Reproducibility, Scientific Pipelines, Workflows
% \noindent
% {\mpbold Note to DSJ editors or referees:} The distributed source of this project (described in Section \ref{sec:publishing}) is available in this URL: \url{https://akhlaghi.org/dsj-paper-\projectversion.tar.gz}
}
\horizontalline
\section{Introduction}
\label{sec:introduction}
The increasing volume and complexity of data analysis has been extraordinarily productive, giving rise to a new branch of ``Big Data'' in many fields of the sciences and industry.
However, given its inherent complexity, the results alone are barely useful; questions naturally arise about their lineage or provenance:
What inputs were used?
How were the configurations or training data chosen?
What operations were done on those inputs, how were the plots made?
See Figure \ref{fig:questions} for some similar questions, classified by their place in a project.
\tonote{Johan: add some general references.}
Due to the complexity of modern data analysis, a small deviation in the final result can be due to many different steps, which may be significant for its interpretation.
Integrity checks are a critical component of the scientific method, but are only possible with access to the data \emph{and} its lineage (workflows).
For example, \citet{smart18} describes how a 7-year-old conflict in theoretical condensed matter physics was only identified after the relevant codes were shared.
\citet{miller06} found that a column of data had been mistakenly flipped in a project's workflow, leading to the retraction of 5 papers in major journals, including \emph{Science}.
\citet{baggerly09} highlighted the inadequate narrative description of the analysis and showed the prevalence of simple errors in published results, ultimately calling their work ``\emph{forensic bioinformatics}''.
\citet{herndon14} and \citet[a self-correction]{horvath15} also reported similar situations and \citet{ziemann16} concluded that one-fifth of papers contain erroneous gene name conversions.
These are mostly from genomics and bioinformatics because publishing workflows is commonly practiced already (for example \href{https://www.myexperiment.org}{myexperiment.org}, \href{https://www.genepattern.org}{genepattern.org}, and \href{https://galaxyproject.org}{galaxy\-project.org}).
The status in other fields, without a culture of publishing workflows, is highly likely to be worse.
Nature is already a black box that we are trying hard to unlock.
Not being able to experiment on the methods of other researchers is a self-imposed black box on top of it.
\begin{figure}[t]
\begin{center}
\includetikz{figure-project-outline}
\end{center}
\vspace{-17mm}
\caption{\label{fig:questions}Graph of a generic project's workflow (connected through arrows), highlighting the various issues/questions on each step.
The green boxes with sharp edges are inputs and the blue boxes with rounded corners are intermediate or final outputs.
The red boxes with dashed edges highlight the main questions at each respective stage.
The box covering software download and build phases shows some common tools software developers use for this phase, but a scientific project is clearly much more involved.
}
\end{figure}
The completeness of a project's published lineage (usually within the ``Methods'' section) can be measured by the ability to reproduce the result.
Several studies have attempted to answer this with different levels of detail.
For example, \citet{allen18} found that roughly half of the papers in astrophysics do not even mention the names of any analysis software, while \citet{menke20} found that this fraction has greatly improved in the medical/biological fields and is currently above $80\%$.
\citet{ioannidis2009} attempted to reproduce 18 published results by two independent groups, but fully succeeded in only 2 of them and partially in 6.
\citet{chang15} attempted to reproduce 67 papers in well-regarded Economics journals that published data and code: only 22 could be reproduced without contacting authors, and more than half could not be replicated at all. \tonote{DVG: even after contacting the authors?}
\citet{stodden18} attempted reproduction of 204 scientific papers published in the journal \emph{Science} \emph{after} the journal adopted a policy of publishing the data and code associated with its papers.
Even though the authors were contacted, the success rate was an abysmal $26\%$.
Overall, this problem is unambiguously assessed as being very serious in the community: \citet{baker16} surveyed 1574 researchers and found that only $3\%$ did not see a ``\emph{reproducibility crisis}''.
Yet, this is not a new problem in the sciences: back in 2011, Elsevier conducted an ``\emph{Executable Paper Grand Challenge}'' \citep{gabriel11} and the proposed solutions were published in a special edition.\tonote{DVG: which were the results?}
Even before that, in an attempt to simulate research projects, \citet{ioannidis05} argued that ``\emph{most claimed research findings are false}''.
In the 1990s, \citet{buckheit1995, claerbout1992} described the same problem very eloquently and also provided some solutions they used.\tonote{DVG: more details here, one is left wondering ...}
Even earlier, through his famous quartet, \citet{anscombe73} qualitatively showed how distancing of researchers from the intricacies of algorithms/methods can lead to misinterpretation of the results.
One of the earliest such efforts we are aware of is the work of \citet{roberts69}, who discussed conventions in Fortran programming and documentation to help in publishing research codes.
While the situation has somewhat improved, all these papers still resonate strongly with the frustrations of today's scientists.
To address the collective problem of preserving a project's data lineage as well as its software dependencies, we introduce Maneage (Managing+Lineage), pronounced man-ee-ij or \textipa{[m{\ae}n}i{\textsci}d{\textyogh}], hosted at \url{http://maneage.org}.
A project using Maneage starts by branching from its main Git branch, allowing the authors to customize it: specifying the necessary software tools for that particular project, adding analysis steps and adding visualizations and a narrative based on the results.
In Section \ref{sec:principles} the founding principles behind Maneage are discussed.
Section \ref{sec:maneage} describes the internal structure of Maneage and Section \ref{sec:discussion} is a discussion on its benefits, caveats and future prospects.
\section{Principles}
\label{sec:principles}
The core principle of Maneage is simple: science is defined primarily by its method, not its result.
As \citet{buckheit1995} describe it, modern scientific papers are merely advertisements of scholarship, while the actual scholarship is the coding behind the plots/results.
Many solutions have been proposed in the last decades, including (but not limited to)
1992: \href{https://sep.stanford.edu/doku.php?id=sep:research:reproducible}{RED},
2003: \href{https://taverna.incubator.apache.org}{Apache Taverna},
2004: \href{https://www.genepattern.org}{GenePattern},
2010: \href{https://wings-workflows.org}{WINGS},
2011: \href{https://www.ipol.im}{Image Processing On Line journal} (IPOL),
\href{https://www.activepapers.org}{Active papers},
\href{https://is.ieis.tue.nl/staff/pvgorp/share}{SHARE},
2015: \href{https://sciunit.run}{Sciunit};
2017: \href{https://falsifiable.us}{Popper};
2019: \href{https://wholetale.org}{WholeTale}.
To help in the comparison, the founding principles of Maneage are listed below.
\begin{enumerate}[label={\bf P\arabic*}]
\item \label{principle:complete}\textbf{Completeness:}
A project that is complete, or self-contained,
(P1.1) has no dependency beyond the Port\-able Operating System (OS) Interface, or POSIX, or a minimal Unix-like environment.
A consequence of this is that the project itself must be stored in plain-text: not needing any specialized software to open, parse or execute.
(P1.2) does not affect the host,
(P1.3) does not require root, or administrator, privileges,
(P1.4) builds its software for an independent environment,
(P1.5) can be run locally (without internet connection),
(P1.6) contains the full project's analysis, visualization \emph{and} narrative, from access to raw inputs to producing final published format (e.g., PDF or HTML),
(P1.7) requires no manual/human interaction and can run automatically \citep[according to][``\emph{a clerk can do it}'']{claerbout1992}.
\emph{Comparison with existing:} except for IPOL, none of the tools above is complete: all have many dependencies beyond POSIX.
For example, the workflows of most recent solutions need Python or Jupyter notebooks.
Because of their complexity (see \ref{principle:complexity}), pre-built binary blobs like containers or virtual machines are the chosen storage format; these are large (gigabytes) and expensive to archive.
Furthermore, the environment is set up with third-party package managers, like Conda, or the OS's own, like \inlinecode{apt} or \inlinecode{yum}.
However, the exact version of \emph{every} software package is rarely recorded, and the servers eventually remove old binaries, so such blobs are hard to recreate.
Blobs also have a short lifespan: for example, Docker containers made today may not be operable with future versions of Docker or Linux (currently Linux 3.2.x, released in 2012, is the earliest supported version).
In general they mostly aim for short-term reproducibility.
A plain-text project is readable by humans and machines (even if it cannot be executed) and typically consumes no more than a megabyte.
\item \label{principle:modularity}\textbf{Modularity:}
A project should be compartmentalized into independent modules with well-defined inputs/outputs having no side effects.
Communication between the independent modules should be explicit, providing several optimizations:
(1) independent modules can run in parallel.
Modules that do not need to be run (because their dependencies have not changed) will not be re-run.
(2) Data provenance extraction (recording any dataset's origins).
(3) Citation: others can credit specific parts of a project.
(4) Usage in other projects.
(5) Most importantly: they are easy to debug and improve.
\emph{Comparison with existing:} Visual workflow tools like Apache Taverna, GenePattern, Kepler or VisTrails encourage this, but the more recent tools (mostly written in Python) leave this to project authors.
However, designing a modular project needs to be encouraged and facilitated.
Otherwise, scientists, who are not usually trained in data management, will rarely design a modular project, leading to great inefficiencies in terms of project cost and/or scientific accuracy (testing/validating will be expensive).
\item \label{principle:complexity}\textbf{Minimal complexity:}
This is Ockham's razor extrapolated to project management \citep[``\emph{Never posit pluralities without necessity}''][]{schaffer15}:
1) avoid complex relations between analysis steps (related to \ref{principle:modularity}).
2) avoid the programming language that is currently in vogue, because it is going to fall out of fashion soon and require significant resources to translate or rewrite it every few years (to stay fashionable).
The same job can be done with more stable/basic tools, requiring less long-term effort.
\emph{Comparison with existing:} IPOL stands out here too (requiring only ISO C), however most others are written in Python, and use Conda or Jupyter (see \ref{principle:complete}).
Besides being incomplete (\ref{principle:complete}), these tools have short lifespans and evolve fast (e.g., Python 2 code cannot run with Python 3, causing disruption in many projects).
Their complex dependency trees also make them hard to maintain; for example, see the dependency tree of Matplotlib in \citet[][Figure 1]{alliez19}, which is one of the simpler Jupyter dependencies.
The longevity of a workflow is determined by its shortest-lived dependency.
\item \label{principle:verify}\textbf{Verifiable inputs and outputs:}
The project should automatically verify its inputs (software source code and data) \emph{and} outputs, not needing expert knowledge to confirm a reproduction.
\emph{Comparison with existing:} Such verification is usually possible in most systems, but as a responsibility of the project authors.
As with \ref{principle:modularity}, due to lack of training, if not actively encouraged and facilitated, it will not be implemented.
\item \label{principle:history}\textbf{History and temporal provenance:}
No project is done in a single/first attempt.
Projects evolve as they are being completed.
It is natural that earlier phases of a project are redesigned/optimized only after later phases have been completed.
This is often seen in exploratory research papers, with statements like ``\emph{we [first] tried method [or parameter] X, but Y is used here because it gave lower random error}''.
A project's ``history'' is thus as scientifically relevant as the final, or published version.
\emph{Comparison with existing:} The solutions above that implement version control usually support this principle.
However, because the systems as a whole are rarely complete (see \ref{principle:complete}), their histories are also incomplete.
IPOL fails here, because only the final snapshot is published.
\item \label{principle:scalable}\textbf{Scalability:}
A project should be scalable to arbitrarily large and/or complex projects.
\emph{Comparison with existing:}
Most of the more recent solutions above are scalable.
However, IPOL, which uniquely stands out in satisfying most principles, fails here: IPOL is devoted to low-level image processing algorithms that \emph{can be} done with no dependencies beyond an ISO C compiler.
IPOL is thus not scalable to large projects, which commonly involve dozens of high-level dependencies, with complex data formats and analysis.
\item \label{principle:freesoftware}\textbf{Free and open source software:}
Technically, reproducibility \citep[as defined by][]{fineberg19} is possible with non-free or non-open-source software (a black box).
This principle is thus necessary to complement that definition (nature is already a black box; we do not need another one):
(1) As free software, others can learn from, modify, and build upon it.
(2) The lineage can be traced to free software's implemented algorithms, also enabling optimizations on that level.
(3) A free-software package that does not execute on particular hardware can be modified to work on it.
(4) A non-free software project typically cannot be distributed by others, making the whole community reliant on the owner's server (even if the owner does not ask for payments).
\emph{Comparison with existing:} The existing solutions listed above are all free software.
Based on this principle, we do not consider non-free solutions.
\end{enumerate}
\section{Maneage}
\label{sec:maneage}
Maneage is an implementation of the principles of Section \ref{sec:principles}.
In practice, Maneage is a collection of plain-text files that are distributed in pre-defined sub-directories by context (a modular source), and are all under version control, currently with Git.
The main Maneage Branch is a fully working skeleton of a project without much flesh: it contains all the low-level infrastructure, but without any actual high-level analysis operations.
Maneage contains a file called \inlinecode{README-hacking.md} (the \inlinecode{README.md} file is reserved for the project using Maneage, not Maneage itself) that has a complete checklist of steps to start a new project and remove demonstration parts.
There are also hands-on tutorials to help new users.
To start a new project, the authors \emph{clone} Maneage, create a branch, and start their project by customizing it.
Thus, projects start with a good data management strategy rather than imposing it in the end, as recommended by \citet{fineberg19}.
Customization is done by adding the names of the necessary software, references to input data, analysis and visualization commands, and writing a narrative description.
This will be done in multiple commits during the project (perhaps years), preserving the project's history: the descriptions of, and motivations for, changes or test failures and successes, as well as the authors and timestamps of each change.
\begin{lstlisting}[language=bash]
git clone https://git.maneage.org/project.git # Clone Maneage, default branch `maneage'.
mv project my-project && cd my-project # Set custom name and enter directory.
git remote rename origin origin-maneage # Rename remote server to use `origin' later.
git checkout -b master # Make new `master' branch, start customizing.
\end{lstlisting}
Maneage has two main phases: (1) configuration, where the necessary software is built and the environment is set up, and (2) analysis, where data are accessed and the software is run to create the final visualizations and report:
\begin{lstlisting}[language=bash]
./project configure # Build all necessary software from source.
./project make # Do the analysis (download data, run software on data, build PDF).
\end{lstlisting}
Section \ref{sec:usingmake} elaborates why Make was chosen as the main job manager.
Sections \ref{sec:projectconfigure} \& \ref{sec:projectanalysis} are on the operations done during the configuration and analysis phase.
The benefit from version control is described in Section \ref{sec:projectgit}.
Section \ref{sec:collaborating} discusses the sharing of a built environment, and finally, Section \ref{sec:publishing} is about the publication, or archival, of Maneage projects.
\subsection{Job orchestration with Make}
\label{sec:usingmake}
Scripts (e.g. shell, Perl, or Python) are an obvious solution for non-interactive (batch) processing.
However, the inherent complexity and non-linearity of progress as a project evolves makes it hard to manage scripts.
For example, if $90\%$ of a research project is done and only the final $10\%$ must be executed, a script will re-do the whole project.
Completed parts can be manually ignored (with conditionals), but this adds to the complexity and discourages experimentation on already completed parts.
These problems motivated the creation of Make in the early Unix OS \citep{feldman79}.
Make continues to be a core component of modern OSs, is actively maintained, and has withstood the test of time.
The Make paradigm starts from the end: the final \emph{target}.
In Make, the project is broken into atomic \emph{rules}, where each rule has a single \emph{target} file which can depend on any number of \emph{prerequisite} files.
To build the target from the prerequisites, each rule also has a \emph{recipe} (an atomic script).
The plain-text files containing Make source code are called Makefiles.
Make does not replace scripting languages like the shell, Python or R.
It is a higher-level structure enabling modular/atomic scripts (in any language) to be put into a workflow.
Besides formalizing a project's data lineage, Make also greatly encourages experimentation in a project, because a recipe is executed only when at least one prerequisite file is more recent than its target.
For example, when only $5\%$ of a project's targets are affected by a change, the other $95\%$ remain dormant.
Furthermore, Make first examines the full lineage before starting the execution of recipes, and it can thus execute independent rules in parallel, improving speed and encouraging experimentation.
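As a minimal, hypothetical illustration of this paradigm (the file names and the recipe below are generic placeholders, not part of Maneage), a rule declares a target, its prerequisites, and the recipe that builds the former from the latter:
\begin{lstlisting}[language=make]
# Hypothetical rule: `stats.txt' (target) is rebuilt from `sample.txt'
# (prerequisite) only when the prerequisite is newer than the target.
# Recipe lines must start with a TAB character.
stats.txt: sample.txt
	awk '{s+=$$1} END {print s/NR}' sample.txt > stats.txt
\end{lstlisting}
If \inlinecode{sample.txt} has not changed, running Make again does nothing: this is the mechanism that lets the unaffected $95\%$ of a project stay dormant.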
Make is well known by many outside of software development communities.
For example, geophysics students have easily adopted it for the RED project management tool \citep{schwab2000}.
We also received very good feedback on the simplicity of using Make from early adopters of Maneage, especially graduate students and postdocs.
\subsection{Project configuration}
\label{sec:projectconfigure}
Maneage organizes both the building of its software and the analysis pipeline using Make (see Section \ref{sec:usingmake}).
Thus, a researcher using Maneage for high-level analysis easily understands and can customize the software environment without needing to learn third-party tools.
The existing tools listed in Section \ref{sec:principles} mostly use package managers like Conda to maintain the software environment, but Conda itself is written in Python, contrary to our completeness principle \ref{principle:complete}.
Highly-robust solutions like Nix and GNU Guix exist, but these require root permissions, contrary to principle P1.3.
Project configuration (building the software environment) is managed by the files under \inlinecode{reproduce\-/soft\-ware} of Maneage's source.
At the start of project configuration, Maneage needs a top-level directory to build itself on the host (software and analysis).
We call this the ``build directory'' and it must not be located inside the source directory (see \ref{principle:modularity}).
No other location on the running OS will be affected by the project, including the source directory.
Two other local directories can optionally be specified by the project when inputs are present locally: 1) software tarball directory and 2) input data directory.
Sections \ref{sec:buildsoftware} and \ref{sec:softwarecitation} detail the building of the required software and the important issue of software citation.
\subsubsection{Verifying and building necessary software from source}
\label{sec:buildsoftware}
To compile the necessary software from source, Maneage currently needs the host to have a C and C++ compiler (available on any POSIX-compliant OS).
Maneage will build and install (in the build directory) all necessary software and their dependencies, all with fixed versions and configurations.
The dependency tree continues down to core OS components including GNU Bash, GNU AWK, GNU Coreutils on all supported OSs.
On GNU/Linux OSs, a fixed version of the GNU Binutils and GNU C Compiler (GCC) is also built, soon a custom GNU C Library will also be included to be fully independent of the host (task 15390).
Except for very low level components like the Kernel or filesystem, Maneage thus builds all other components necessary for the project.
Because there is no pure/raw POSIX OS, Maneage aims to run on existing POSIX-compatible OSs; failure to build on any one of them is treated as a bug to be fixed.
It is currently being actively tested on GNU/Linux and macOS.
A Maneage project can be configured in a container or virtual machine to facilitate moving the project without rebuilding everything from source, or to use it on non-compatible OSs.
However, such binary blobs are not the primary storage/archival format of Maneage.
Before building each software package, its source code is validated against its SHA-512 checksum (stored in the project).
Maneage includes a growing collection of scientific software (and its dependencies), much of which is superfluous for any single project.
Therefore, each project has to identify its high-level software in the \inlinecode{TARGETS.conf} file.
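As an illustrative sketch (the variable name below is an assumption for demonstration, not necessarily Maneage's exact syntax), a project might declare its high-level software in \inlinecode{TARGETS.conf} with simple Make variable assignments:
\begin{lstlisting}[language=make]
# Hypothetical TARGETS.conf: only the high-level software is listed;
# Maneage resolves and builds its dependencies automatically.
top-level-programs = gnuastro
\end{lstlisting}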
\subsubsection{Software citation}
\label{sec:softwarecitation}
Maneage contains the full list of software built for the project, but this information is buried deep in the source.
Maneage prints a simplified description of this information in the project's final report, blended into the narrative, as in the Acknowledgments of this paper.
Furthermore, when the software is associated with a published paper, that paper's Bib\TeX{} entry is added to the final report and is duly cited with the software's name and version.
This paper uses basic software without associated scientific papers. For software citation examples, see \citet{akhlaghi19} and \citet{infante20}.
This is particularly important for research software, where citation is critical to justify continued development.
A notable example is GNU Parallel \citep{tange18} which prints citation information each time it is run, proposing to either cite the paper or support it with 10000 euros.
It provides a \inlinecode{--citation} option to disable the notice.
In \href{https://git.savannah.gnu.org/cgit/parallel.git/tree/doc/citation-notice-faq.txt?h=master}{its FAQ} this is justified by ``\emph{If you feel the benefit from using GNU Parallel is too small to warrant a citation, then prove that by simply using another tool}''.
Most software does not resort to such drastic measures. However, proper citation is not only useful practically, it is also an ethical imperative.
Given the increasing role of software in research \citep{clement19}, automatic citation is a robust solution.
For a review of the necessity and basic elements of software citation, see \citet{katz14} and \citet{smith16}.
The CodeMeta and Citation File Format (CFF) projects aim to expand software citation beyond Bib\TeX{}, while Software Heritage \citep{dicosmo18} also includes archival and citation abilities.
These will be tested and enabled in Maneage.
\subsection{Project analysis}
\label{sec:projectanalysis}
The analysis operations run with no influence from the host OS, enabling an isolated environment without the extra layer of containers or a virtual machine.
In Maneage, a project's analysis is broken into two phases: 1) preparation, and 2) analysis.
Both have an identical internal structure.
The preparation phase is usually only necessary for advanced users who need to optimize extremely large datasets.
The analysis phase consists of many steps, including data access (possibly by downloading), running various steps of the analysis on the raw inputs, and creating the necessary figures or tables for a published report, or output datasets for a database.
If all of these steps were organized in a single Makefile, it would become very long, and would be hard to maintain, extend, read, reuse, and cite.
Large files are in general a bad practice and against the modularity and minimal complexity principles (\ref{principle:modularity} \& \ref{principle:complexity}).
Maneage is thus designed to encourage and facilitate modularity by distributing the analysis into many Makefiles that contain contextually-similar analysis steps.
Hereafter, these lower-level Makefiles are termed \emph{subMakefiles}.
When run with the \inlinecode{make} argument, the \inlinecode{project} script (Section \ref{sec:maneage}), calls \inlinecode{top-make.mk}, which loads the subMakefiles using the \inlinecode{include} directive (see Section \ref{sec:analysis}).
All the analysis Makefiles are in \inlinecode{re\-produce\-/anal\-ysis\-/make}. Figure \ref{fig:datalineage} shows their relationship with the target/built files that they manage.
To keep the project's logic clear and simple (minimal complexity principle, \ref{principle:complexity}), recursion (where one instance of Make calls Make internally) is, by default, not used.
\begin{figure}[t]
\begin{center}
\includetikz{figure-data-lineage}
\end{center}
\vspace{-7mm}
\caption{\label{fig:datalineage}Schematic representation of a project's data lineage, or workflow, for the demonstration analysis of this paper.
Each colored box is a file in the project and the arrows show the dependencies between them.
Green files/boxes are plain-text files that are under version control and in the source directory.
Blue files/boxes are output files in the build-directory, shown within the Makefile (\inlinecode{*.mk}) where they are defined as a \emph{target}.
For example, \inlinecode{paper.pdf} depends on \inlinecode{project.tex} (in the build directory; generated automatically) and \inlinecode{paper.tex} (in the source directory; written manually).
The solid arrows and full-opacity built boxes are described in Section \ref{sec:projectanalysis}.
The dashed arrows and low-opacity built boxes show the scalability by adding hypothetical steps to the project.
}
\end{figure}
To avoid getting too abstract in the subsections below, where necessary we will do a basic analysis on the data of \citet{menke20} (hereafter M20) and replicate one of the results.
We cannot use the same software as M20, because M20 used Microsoft Excel for their analysis, violating several of our principles: \ref{principle:complete}, \ref{principle:complexity} and \ref{principle:freesoftware}.
Since we do not use the same software, this does not qualify as a reproduction \citep{fineberg19}.
In the subsections below, this paper's analysis on that dataset is described using the data lineage graph of Figure \ref{fig:datalineage}.
We will follow Make's paradigm (see Section \ref{sec:usingmake}) of tracing the lineage backwards, from the ultimate target in Section \ref{sec:paperpdf} (bottom of Figure \ref{fig:datalineage}) to the configuration files in Section \ref{sec:configfiles} (top of Figure \ref{fig:datalineage}).
To better understand this project, we recommend studying this paper's own Maneage source, published as a supplement.
\subsubsection{Ultimate target: the project's paper or report (\inlinecode{paper.pdf})}
\label{sec:paperpdf}
The ultimate purpose of a project is to report the data analysis result, as raw visualizations, or numbers blended in with a narrative.
In Figure \ref{fig:datalineage}, this is \inlinecode{paper.pdf}, which is the only built file (blue box) with no outward arrows leaving it.
The instructions to build \inlinecode{paper.pdf} are in the \inlinecode{paper.mk} subMakefile.
Its prerequisites include \inlinecode{paper.tex} and \inlinecode{references.tex} (Bib\TeX{} entries for possible citations) in the project source, and \inlinecode{project.tex}, which is a built product.
The high-level connections of this project with previous projects are formalized by \inlinecode{references.tex}.
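A simplified sketch of the corresponding rule is shown below; the recipe is an assumption that only captures the idea (the real \inlinecode{paper.mk} also manages build-directory paths and bibliography details):
\begin{lstlisting}[language=make]
# Hypothetical, simplified form of the rule behind paper.pdf: the final
# report is an ordinary Make target, rebuilt only when a prerequisite
# (narrative, references or analysis macros) changes.
paper.pdf: paper.tex references.tex project.tex
	pdflatex paper.tex
	biber paper
	pdflatex paper.tex
\end{lstlisting}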
\subsubsection{Values within text (\inlinecode{project.tex})}
\label{sec:valuesintext}
Figures, plots, tables, datasets, and/or narrative are not the only outputs of a project.
In many cases, quantitative values from the analysis are also blended into the sentences of the report's narration, or published with the dataset in a database.
An example is in the abstract of \citet[\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}, written in Maneage]{akhlaghi19}: ``\emph{... detect the outer wings of M51 down to S/N of 0.25 ...}''.
The value `0.25', for the signal-to-noise ratio (S/N), also depends on the analysis, and is thus also an output.
Manually typing such numbers in the narrative is prone to errors and discourages experimentation.
To automatically generate and blend them in the text, Maneage uses \LaTeX{} macros.
For example, the \LaTeX{} source of the quote above is: ``\inlinecode{\small detect the outer wings of M51 down to S/N of \$\textbackslash{}demo\-sf\-optimized\-sn\$}''.
The ma\-cro ``\inlinecode{\small\textbackslash{}demosfoptimizedsn}'' is automatically created during the project. It expands to the value ``\inlinecode{0.25}'' when the PDF output is built.
All such values are referenced in \inlinecode{project.tex}.
However, managing them in a single file would violate the modularity principle, be hard to parallelize, frustrating to manage, and bug-prone.
All subMakefiles thus contain a fixed target with the same name but a different suffix: \inlinecode{.tex} instead of \inlinecode{.mk}, hosting values generated in that subMakefile.
Figure \ref{fig:datalineage} shows them as built products of every subMakefile, except for \inlinecode{paper.mk}.
These \LaTeX{} macro files form the core skeleton of a Maneage project: as shown in Figure \ref{fig:datalineage}, the outward arrows of all built files in any subMakefile ultimately lead to one of these \LaTeX{} macro files.
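As a hedged sketch of how a subMakefile can generate its macro file (the directory variable and the recipe are illustrative assumptions; only the file and macro names follow this paper's example), consider:
\begin{lstlisting}[language=make]
# Hypothetical fragment of demo-plot.mk: compute a value and write it
# as a LaTeX macro, so the narrative always reflects the latest result.
$(mtexdir)/demo-plot.tex: tools-per-year.txt
	printf '\\newcommand{\\menkenumyears}{%d}\n' \
	       $$(wc -l < tools-per-year.txt) > $@
\end{lstlisting}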
\subsubsection{Verification of outputs (\inlinecode{verify.mk})}
\label{sec:outputverification}
Before the modular \LaTeX{} macro files described above are merged into the single \inlinecode{project.tex} file, they need to pass through the verification filter, which implements another core principle of Maneage, \ref{principle:verify}.
Confirming the checksum of the final PDF or of figures and datasets is not generally useful because many tools write the creation date into the files.
To avoid this, the raw data must be verified independently of metadata such as the creation date.
Some standards include such date-independent verification features, for example, the \inlinecode{DATASUM} keyword in the FITS format \citep{pence10}.
To facilitate output verification, Maneage has the \inlinecode{verify.mk} subMakefile that separates the analytical phase of the paper from the production of the report (see Figure \ref{fig:datalineage}).
This file implements some tests on pre-defined formats.
Other formats can easily be added.
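As a minimal sketch of the kind of check this implies (the target and configuration variable names are hypothetical; the real \inlinecode{verify.mk} is more general), a verification recipe can compare a dataset's checksum with the value recorded in the project and abort on any mismatch:
\begin{lstlisting}[language=make]
# Hypothetical verification step: fail loudly if the computed checksum
# of an output differs from the expected value stored in the project.
verify-demo: tools-per-year.txt
	@[ "$$(sha512sum tools-per-year.txt | cut -d ' ' -f 1)" \
	     = "$(verify-tools-per-year)" ] \
	  || { echo "Verification failed: tools-per-year.txt"; exit 1; }
\end{lstlisting}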
\subsubsection{The analysis}
\label{sec:analysis}
The analysis is demonstrated with the practical example of replicating Figure 1C of M20, with some enhancements, in Figure \ref{fig:toolsperyear}.
As shown in Figure \ref{fig:datalineage}, for this example we split this goal into two subMakefiles: \inlinecode{format.mk} and \inlinecode{demo-plot.mk}.
The former converts the Excel-formatted input into comma-separated value (CSV) format, and the latter generates the table to build Figure \ref{fig:toolsperyear}.
In a real project, subMakefiles could, and will, be much more complex.
Their location after the standard starting subMakefiles (initialization and download) and before the standard ending subMakefiles (verification and final paper) is important, along with their order.
\begin{figure}[t]
\begin{center}
\includetikz{figure-tools-per-year}
\end{center}
\vspace{-5mm}
\caption{\label{fig:toolsperyear}Ratio of papers mentioning software tools (green line, left vertical axis) to total number of papers studied in that year (light red bars, right vertical axis in log-scale).
This is an enhanced replica of Figure 1C of \citet{menke20}, shown here to demonstrate Maneage; see Figure \ref{fig:datalineage} for its lineage and Section \ref{sec:analysis} for how it was organized.
}
\end{figure}
To enhance the original M20 plot, Figure \ref{fig:toolsperyear} also shows the number of papers in each year and its horizontal axis shows the full range of the data (starting from \menkefirstyear), while M20 starts from 1997.
This was probably because the authors judged the earlier years' data to be too noisy. For example, in \menkenumpapersdemoyear, only \menkenumpapersdemocount{} papers were analysed.
Both the numbers in the previous sentence (\menkenumpapersdemoyear{} and \menkenumpapersdemocount), and the dataset's oldest year (mentioned above: \menkefirstyear) are automatically generated \LaTeX{} macros, see Section \ref{sec:valuesintext}.
These are \emph{not} typeset manually in this narrative explanation.
This step (generating the macros) is shown schematically in Figure \ref{fig:datalineage} with the arrow from \inlinecode{tools-per-year.txt} to \inlinecode{demo-plot.tex}.
To create Figure \ref{fig:toolsperyear}, we used the PGFPlots package within \LaTeX{}.
Therefore, the necessary analysis output to feed into \LaTeX{} was a plain-text table with 3 columns (year, paper per year, tool fraction per year).
This table is shown in the lineage graph of Figure \ref{fig:datalineage} as \inlinecode{tools-per-year.txt}. The PGFPlots source to generate this figure is located in \inlinecode{tex\-/src\-/figure\--tools\--per\--year\-.tex}.
If another plotting tool were desired (for example Python's Matplotlib, or Gnuplot), the built graphic file (for example \inlinecode{tools-per-year.pdf}) would be the target instead.
The file \inlinecode{tools-per-year.txt} is a value-added table with only \menkenumyears{} rows (one row for every year).
The original dataset had \menkenumorigrows{} rows (one row for each year of each journal).
We see in Figure \ref{fig:datalineage} that it is defined as a Make \emph{target} in \inlinecode{demo-plot.mk} and that its prerequisite is \inlinecode{menke20-table-3.txt} (schematically shown by the arrow connecting them).
Both the row counts mentioned at the start of this paragraph are again macros.
In Figure \ref{fig:datalineage}, we see that \inlinecode{menke20-table-3.txt} is a target in \inlinecode{format.mk} and its prerequisite is the input file \inlinecode{menke20.xlsx} (XLSX I/O is used for the conversion).
The input files (which come from outside the project) are all \emph{targets} in \inlinecode{download.mk} and are further discussed in Section \ref{sec:download}.
\subsubsection{Importing and validating inputs (\inlinecode{download.mk})}
\label{sec:download}
The \inlinecode{download.mk} subMakefile is present in all projects, containing common steps for importing the input dataset(s).
All necessary datasets are imported through this subMakefile, irrespective of where the dataset is \emph{used}.
The relation between the project and the outside world is maintained in this single subMakefile, aiming at modularity (\ref{principle:modularity}), minimal complexity (\ref{principle:complexity}), and internet security.
Each external dataset has some basic information, including its expected name on the local system (for offline access), a checksum to validate it (either the whole file or just its main ``data'', as discussed in Section \ref{sec:outputverification}), and its URL/PID.
In Maneage, they are stored in the \inlinecode{INPUTS.conf} file.
See Figure \ref{fig:datalineage} for the position of \inlinecode{INPUTS.conf} in the project's file structure and data lineage.
Each piece of information is stored as a Make variable and is automatically loaded into the full project when Make starts (like the other configuration files), making it usable in any subMakefile.
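A hypothetical sketch of such an entry is shown below (the variable names and placeholder values are only illustrative, not Maneage's exact convention):
\begin{lstlisting}[language=make]
# Hypothetical INPUTS.conf entries for the M20 dataset; each assignment
# becomes a Make variable that is usable in any subMakefile.
MK20-FILENAME = menke20.xlsx
MK20-CHECKSUM = <SHA-512 checksum of menke20.xlsx>
MK20-URL      = <URL or PID from which to download menke20.xlsx>
\end{lstlisting}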
\subsubsection{Configuration files}
\label{sec:configfiles}
The subMakefiles discussed above should only organize the analysis; they should not contain any fixed numbers, settings, or parameters. These should instead be set as variables in configuration files.
Configuration files logically separate the low-level implementation from the high-level running of a project.
In the data lineage plot of Figure \ref{fig:datalineage}, configuration files are shown as sharp-edged, green \inlinecode{*.conf} boxes in the top row (for example, the file \inlinecode{INPUTS.conf} that was mentioned in Section \ref{sec:download}).
All the configuration files of a project are placed under the \inlinecode{reproduce/analysis/config} subdirectory, and are loaded into \inlinecode{top-make.mk} before any of the subMakefiles, hence they are available to all of them.
The example analysis in Section \ref{sec:analysis}, in which we reported the number of papers studied by M20 in \menkenumpapersdemoyear, illustrates this.
The year ``\menkenumpapersdemoyear'' is not written by hand in \inlinecode{demo-plot.mk}.
It is referenced through the \inlinecode{menke-year-demo} variable, which is defined in \inlinecode{menke-demo-year.conf}, a prerequisite of the \inlinecode{demo\--plot\-.tex} rule (see Figure \ref{fig:datalineage}).
If we wished to report the number in a different year, it would be sufficient to change the value in \inlinecode{menke-demo-year.conf}.
A configuration file is a prerequisite of the target that uses it, so after the change, its timestamp will be newer than \inlinecode{demo-plot.tex}.
Thus, Make will re-execute the recipe to generate the macro file before this paper is re-built and the corresponding year and value will be updated in this paper, always in synchronization with each other and no matter how many times they are used.
Combined with the fact that all source files in Maneage are under version control, this encourages testing of various settings of the
analysis as the project evolves in the case of exploratory research papers, and better self-consistency in hypothesis testing papers.
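For illustration, this pattern can be sketched as follows (the year shown is an arbitrary placeholder and the recipe is a simplified assumption, not the project's exact code):
\begin{lstlisting}[language=make]
# Hypothetical menke-demo-year.conf: the only place the year is written.
menke-year-demo = 1991

# Hypothetical fragment of demo-plot.mk: the configuration file is a
# prerequisite, so changing the year above re-runs this recipe and
# refreshes the macros that are used in the narrative.
demo-plot.tex: tools-per-year.txt menke-demo-year.conf
	awk -v y=$(menke-year-demo) \
	    '$$1==y {printf "\\newcommand{\\menkenumpapersdemocount}{%d}\n", $$2}' \
	    tools-per-year.txt > $@
\end{lstlisting}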
\subsubsection{Project initialization (\inlinecode{initialize.mk})}
\label{sec:initialize}
The \inlinecode{initial\-ize\-.mk} subMakefile is present in all projects and is the first subMakefile loaded into \inlinecode{top-make.mk} (see Figure \ref{fig:datalineage}).
It does not contain any analysis or major processing steps; it just initializes the system by setting up the necessary Make environment and doing other general jobs, like defining the Git commit hash of the run as a \LaTeX{} macro (\inlinecode{\textbackslash{}projectversion}) that can be loaded into the narrative.
Papers using Maneage usually put this hash as the last word in their abstract, for example, see \citet{akhlaghi19} and \citet{infante20}.
For the current version of this paper, it expands to \projectversion.
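A hedged sketch of this step (the directory variable is an assumption and the recipe only shows the core idea; the real rule is re-run on every build) is:
\begin{lstlisting}[language=make]
# Hypothetical fragment of initialize.mk: record the Git commit of the
# running project as a LaTeX macro, usable anywhere in the narrative.
$(mtexdir)/initialize.tex:
	printf '\\newcommand{\\projectversion}{%s}\n' \
	       "$$(git describe --dirty --always)" > $@
\end{lstlisting}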
\subsection{Projects as Git branches of Maneage}
\label{sec:projectgit}
Maneage projects are primarily stored as plain-text files.
They can thus be efficiently maintained under version control systems (currently Git).
Every commit in the version-controlled history contains \emph{a complete} snapshot of the data lineage (see the completeness principle \ref{principle:complete}).
Maneage is maintained by its developers in a central branch, \inlinecode{man\-eage}.
The \inlinecode{man\-eage} branch contains all the low-level infrastructure, a skeleton, that is needed by any new project.
As shown in Section \ref{sec:maneage} new projects start by cloning \inlinecode{man\-eage} and customizing their own Git branch, or fork.
Figure \ref{fig:branching}(a) shows how a project has started by branching off commit \inlinecode{0c120cb}.
%% Exact URLs of imported images.
%% Collaboration icon: https://www.flaticon.com/free-icon/collaboration_809522
%% Paper done: https://www.flaticon.com/free-icon/file_2521838
%% Paper processing: https://www.flaticon.com/free-icon/file_2521989
\begin{figure}[t]
\includetikz{figure-branching}
\vspace{-3mm}
\caption{\label{fig:branching} Harvesting the power of version-control in project management with Maneage.
Maneage is maintained as a core branch, with projects created by branching off it.
(a) shows how projects evolve on their own branch, but can always update their low-level structure by merging with the core branch.
(b) shows how a finished/published project can be revitalized for new technologies simply by merging with the core branch.
Each Git ``commit'' is shown on its branch as a colored ellipse, with its hash printed inside.
The commits are colored based on the team working on that branch.
The collaboration and paper icons are respectively made by `mynamepong' and `iconixar' and downloaded from \url{www.flaticon.com}.
}
\end{figure}
After a project starts, Maneage will evolve with new features or fixed bugs.
Because all projects branch from it, updating the project's low-level skeleton is as easy as merging the \inlinecode{maneage} branch into the project's branch.
For example, in Figure \ref{fig:branching}(a), see how Maneage's \inlinecode{3c05235} commit has been merged into the project's branch in commit \inlinecode{2ed0c82}.
Hence infrastructure improvements and fixes are easily propagated to all projects.
Another useful scenario is reviving a finished/published project at a later date, possibly by other researchers, as shown in Figure \ref{fig:branching}(b): for example, when the original project was completed years ago and is no longer directly executable.
Other scenarios include projects that are created by merging various other projects.
Modern version control systems provide many more capabilities that can be leveraged through Maneage in project management, thanks to the shared branch it has with \emph{all} derived projects, and thanks to its completeness (\ref{principle:complete}).
\subsection{Multi-user collaboration on single build directory}
\label{sec:collaborating}
Because the project's source and build directories are separate, an option is provided for different users to share a build directory, while working on their own separate project branches during a collaboration.
This is similar to the parallel branch that is later merged in Figure \ref{fig:branching}(a).
To enable this mode, the \inlinecode{./project} script has an option \inlinecode{--group} that must be given the name of a (POSIX) user group in the host OS.
All built files are then automatically assigned to this user group, with read and write permissions for all members.
Permission management and avoiding conflicts in the build directory (while members work on different branches) is the responsibility of the team.
\subsection{Publishing the project}
\label{sec:publishing}
In a scientific scenario, the final report is submitted to a journal, while in an industrial context it is submitted to the customers or employers.
To facilitate publication of the project's source with the narrative, Maneage has a \inlinecode{dist} target, which is activated with \inlinecode{./project make dist}.
In this mode, Maneage will not do any analysis; it will instead put the full project source (for the given commit, without the version history), along with all the built files that are necessary for \LaTeX{}, into a compressed \inlinecode{.tar.gz} file.
This is useful for publishers to create the report without necessarily building the full project: since the full project source is included, it can be rebuilt.
The \inlinecode{dist-zip} target provides Zip compression as an alternative.
Depending on the built graphics used in the report, this compressed file will usually be roughly a megabyte.
However, the required inputs and the outputs may be much bigger, from megabytes to petabytes.
This gives two scenarios for publication of the project: 1) publishing only the source, or 2) publishing the source with the data.
In the former case, the output of \inlinecode{dist} can be submitted to the journal as a supplement, or uploaded to pre-print servers like \href{https://arXiv.org}{arXiv} that will compile the \LaTeX{} source and build their own PDFs.
The Git history can also be archived as a single ``bundle'' file and submitted as a supplement.
When publishing with datasets, the project's outputs, and/or inputs, can be published on servers like Zenodo.
For example, \citet[\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}]{akhlaghi19} uploaded all the project's required software tarballs (mentioned in the acknowledgements) and its final PDF, along with the project's source and a Git ``bundle''.
\section{Discussion \& Caveats}
\label{sec:discussion}
To optimally extract the potential of big data in science, we need to have a complete view of its lineage.
Scientists are, however, rarely trained sufficiently in data management or software development, and the plethora of high-level tools that change every few years does not help.
Such high-level tools are primarily targeted at software developers, who are paid to learn them and use them effectively for short-term projects.
Scientists, on the other hand, need to focus on their own research fields, and need to think about longevity.
The primordial implementation was written for \citet{akhlaghi15}.
To use it in other projects without a full re-write, the skeleton was separated from the flesh as a more abstract ``template'' that was used in \citet{bacon17}, in particular Sections 4 and 7.3 (respectively in \href{http://doi.org/10.5281/zenodo.1163746}{zenodo.1163746} and \href{http://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}).
Later, software building was incorporated and used in \citet[\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}]{akhlaghi19} and \citet[\href{https://doi.org/10.5281/zenodo.3524937}{zenodo.3524937}]{infante20}.
After this paper is published, bugs will still be found and Maneage will continue to evolve and improve; significant changes with respect to this paper will be listed in \inlinecode{README-hacking.md}.
Adoption of Maneage projects on a wide scale will make it possible to feed these into machine learning (ML) tools for automatic workflow generation, optimized for desired characteristics of the results.
Because Maneage is complete, algorithms and data selection methods can be optimized; and because the analysis is directly connected to the narrative and history of a project, natural language processing can also be studied.
Parsers can be written over Maneage-derived projects for meta-research and data provenance studies, for example to generate ``research objects''.
As another example, when a bug is found in one software package, all affected projects can be found and the scale of the effect can be measured.
Combined with Software Heritage, precise high-level science parts of Maneage projects can be accurately cited (e.g., failed/abandoned tests at any historical point).
Many components of ``machine-actionable'' data management plans \citep{miksa19b} can be automatically filled out by Maneage, which is useful for project PIs and grant funders.
Maneage was awarded a Research Data Alliance (RDA) adoption grant for implementing the recommendations of the Publishing Data Workflows working group \citep{austin17}.
Maneage's user base and development have grown phenomenally, highlighting several caveats.
Firstly, Maneage uses very low-level tools that are not widely used by scientists, e.g., Git, \LaTeX, Make and the command line.
This is primarily because of a lack of exposure.
Witnessing the improvements in their research, many (especially early career researchers) have started mastering these tools as they adopt Maneage.
We are thus working on tutorials and improving documentation.
Secondly, the many software packages used on various POSIX-compatible systems require maintenance.
However, because Maneage builds its software in the same Make framework as the analysis, users' experience in the analysis empowers them to add/fix their required software with the same Make tools.
This has already happened, with improvements contributed to the core Maneage branch, propagating to all projects.
Thirdly, publishing a project's reproducible data lineage immediately after publication enables others to continue with followup papers in competition with the original authors.
We propose these solutions:
1) Through the Git history, the work added by another team at any phase of the project can be quantified, contributing to a new concept of authorship in scientific projects and helping to quantify Newton's famous ``\emph{standing on the shoulders of giants}'' quote.
This is a long-term goal and requires major changes to academic value systems.
2) Authors can be given a grace period where the journal or a third party embargoes the source, keeping it private for the embargo period and then publishing it.
%% Acknowledgements
\section*{Acknowledgments}
The authors wish to thank (sorted alphabetically)
Julia Aguilar-Cabello,
Alice Allen,
Pedram Ashofteh Ardakani,
Roland Bacon,
Surena Fatemi,
Fabrizio Gagliardi,
Konrad Hinsen,
Mohammad-reza Khellat,
Johan Knapen,
Tamara Kovazh,
Ryan O'Connor,
Simon Portegies Zwart,
Idafen Santana-P\'erez,
Elham Saremi,
Yahya Sefidbakht,
Zahra Sharbaf,
Nadia Tonello,
and Ignacio Trujillo
for their useful help, suggestions and feedback on Maneage and this paper.
Work on Maneage, and this paper, has been partially funded/supported by the following institutions:
The Japanese Ministry of Education, Culture, Sports, Science, and Technology ({\small MEXT}) PhD scholarship to M. Akhl\-aghi and its Grant-in-Aid for Scientific Research (21244012, 24253003).
The European Research Council (ERC) advanced grant 339659-MUSICOS.
The European Union (EU) Horizon 2020 (H2020) research and innovation programmes No 777388 under RDA EU 4.0 project, and Marie Sk\l{}odowska-Curie grant agreement No 721463 to the SUNDIAL ITN.
The State Research Agency (AEI) of the Spanish Ministry of Science, Innovation and Universities (MCIU) and the European Regional Development Fund (ERDF) under the grant AYA2016-76219-P.
The IAC project P/300724, financed by the MCIU, through the Canary Islands Department of Economy, Knowledge and Employment.
The Fundaci\'on BBVA under its 2017 programme of assistance to scientific research groups, for the project ``Using machine-learning techniques to drag galaxies from the noise in deep imaging''.
The ``A next-generation worldwide quantum sensor network with optical atomic clocks'' project of the TEAM IV programme of the Foundation for Polish Science co-financed by the EU under ERDF.
The Polish MNiSW grant DIR/WK/2018/12.
The Pozna\'n Supercomputing and Networking Center (PSNC) computational grant 314.
\input{tex/build/macros/dependencies.tex}
\section*{Competing Interests}
The authors have no competing interests to declare.
\section*{Author Contributions}
\begin{enumerate}
\item Mohammad Akhlaghi: principal author of the Maneage source code and this paper, also principal investigator (PI) of the RDA Adoption grant awarded to Maneage.
\item Ra\'ul Infante-Sainz: contributed many commits to the source of Maneage, also involved in early testing and writing this paper.
\item Boudewijn F. Roukema: involved in Maneage and its testing, contributed to writing this paper.
\item David Valls-Gabaud: involved in the Maneage project and its testing and contributed to writing this paper.
\item Roberto Baena-Gall\'e: contributed to early testing of Maneage and in writing this paper.
\end{enumerate}
%% Tell BibLaTeX to put the bibliography list here.
\printbibliography
%% Finish LaTeX
\end{document}
%% This file is part of Maneage (https://maneage.org).
%
%% This file is part of Maneage. Maneage is free software: you can
%% redistribute it and/or modify it under the terms of the GNU General
%% Public License as published by the Free Software Foundation, either
%% version 3 of the License, or (at your option) any later version.
%
%% Maneage is distributed in the hope that it will be useful, but WITHOUT
%% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
%% more details. See <http://www.gnu.org/licenses/>.