aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMohammad Akhlaghi <mohammad@akhlaghi.org>2020-06-17 05:16:58 +0100
committerMohammad Akhlaghi <mohammad@akhlaghi.org>2020-06-17 05:16:58 +0100
commite543088567ce06626f5b94219d63bddafb1d8d6f (patch)
tree5b497ce257c91ebc04d86823b5e739850f9bd452
parent11696587813b77a79ae72da9b3ea01f602ceb231 (diff)
First draft of short slides
A first draft of the summarized slides was written for the talk in the RDA Adoption week.
-rw-r--r--img/wds.jpgbin0 -> 67599 bytes
-rw-r--r--slides-intro-short.tex916
2 files changed, 913 insertions, 3 deletions
diff --git a/img/wds.jpg b/img/wds.jpg
new file mode 100644
index 0000000..032fb1d
--- /dev/null
+++ b/img/wds.jpg
Binary files differ
diff --git a/slides-intro-short.tex b/slides-intro-short.tex
index b5e680a..88c14aa 100644
--- a/slides-intro-short.tex
+++ b/slides-intro-short.tex
@@ -34,11 +34,13 @@
(\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}
%% Set the title
-\title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility
- \\ {\normalsize Introducing \emph{Maneage}: customizable framework for \emph{man}aging data lin\emph{eage}}}
+\title{{\huge\textbf{BIG} Data, \textbf{BIG} responsibility} \\
+ {\LARGE Towards Long-term and Archivable Reproducibility} \\
+ \vspace{2mm}{\small [RDA Europe Adoption grant recipient. Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE} (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}), Comments welcome]}
+}
%% Set the author
-\author{\vspace{1cm}\\
+\author{\vspace{8mm}\\
\href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm}
\footnotesize
Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain
@@ -88,4 +90,912 @@
\titlepage
\end{frame}
+
+ \begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}}
+ Challenges (also relevant to researchers, not just repositories)
+ \begin{itemize}
+ \item \emph{Bi-directional linking}: how to \alert{link data and publications}.
+ \item \emph{\alert{Software management}:} how to manage, preserve, publish and cite software?
+ \item \emph{Metrics:} \alert{how often} are data used.
+ \item \emph{Incentives to researchers:} how to \alert{communicate benefits} of following good practices \alert{to researchers}.
+ \end{itemize}
+
+ \begin{center}
+ \includegraphics[width=4cm]{img/rda.png}\hspace{1cm}
+ \includegraphics[width=4cm]{img/wds.jpg}
+ \end{center}
+
+ \pause
+ ``\emph{We would like to see a workflow that results in all
+ \textcolor{blue!30!green}{\bf scholarly objects being connected},
+ linked, citable, and persistent to allow researchers to navigate
+ smoothly and to \alert{\bf enable reproducible research}. This
+ includes \alert{{\bf linkages} between documentation, code, data, and
+ journal articles in an integrated environment}. Furthermore,
+ in the ideal workflow, all of these objects need to be
+ \alert{\bf well documented} to enable other researchers (or
+ citizen scientists etc) to reuse the data for new
+ discoveries.}''
+ \end{frame}
+
+\usebackgroundtemplate{ } %% undeclare it
+
+
+ \newcommand{\allopacity}{1}
+ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \newcommand{\paperinit}{}
+ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \newcommand{\sver}{}
+ \newcommand{\srep}{}
+ \newcommand{\dver}{}
+ \newcommand{\ddver}{}
+ \newcommand{\confopt}{}
+ \newcommand{\confenv}{}
+ \newcommand{\containers}{}
+ \newcommand{\db}{}
+ \newcommand{\calib}{}
+ \newcommand{\corr}{}
+ \newcommand{\runord}{}
+ \newcommand{\runopt}{}
+ \newcommand{\humanerr}{}
+ \newcommand{\confirmbias}{}
+ \newcommand{\depupdate}{}
+ \newcommand{\coauth}{}
+ \newcommand{\varsinpaper}{}
+ \newcommand{\recordinfo}{}
+ \newcommand{\softcite}{}
+ \newcommand{\prevchange}{}
+ \newcommand{\paperfinal}{}
+ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+
+ %% Don't show the happy scientist or the existing containers box.
+ \let\paperinit\undefined
+ \let\allopacity\undefined
+ \let\paperfinal\undefined
+ \let\containers\undefined
+
+
+
+
+
+ \begin{frame}{Science is a tricky business}
+ \begin{center}
+ \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg}
+ \end{center}
+
+ \vspace{-0.3cm}\hfill
+ {\tiny Image from nature.com
+ (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five
+ ways to fix statistics}'', Nov 2017)}\hspace{7mm}
+
+ \vspace{-1mm}
+ \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm]
+ \small Data analysis [...] is a \alert{human
+ behavior}. Researchers who hunt hard enough will turn up a
+ result that fits statistical criteria, but their
+ \alert{discovery} will probably be a \alert{false positive}.
+
+ \hfill Five ways to fix statistics, Nature, 551, Nov 2017.
+ \end{tcolorbox}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Founding criteria}
+ \begin{tcolorbox}[title=Basic/simple principle:]
+ \centering Science is defined by its METHOD, \alert{not} its
+ result.
+ \end{tcolorbox}
+
+ \pause
+ \begin{itemize}
+ \item \textbf{Complete/self-contained:}
+ \begin{itemize}
+ \item \alert{Only dependency} should be \alert{POSIX} tools \textcolor{gray}{(discards Conda or Jupyter which need Python)}.
+ \item Must \alert{not require root} permissions \textcolor{gray}{(discards tools like Docker or Nix/Guix)}.
+ \item Should be \alert{non-interactive} or runnable in batch (user interaction is an incompleteness).
+ \item Should be usable \alert{without internet} connection.
+ \end{itemize}
+
+ \pause
+ \item \textbf{Modularity:} Parts of the project should be \alert{re-usable} in other projects.
+ \pause
+ \item \textbf{Plain text:} Project's source should be in \alert{plain-text} \textcolor{gray}{(binary formats need special software)}
+ \begin{itemize}
+ \item This includes high-level analysis.
+ \item It is easily publishable (very low volume of $\times100$KB), archivable, and parse-able.
+ \item \alert{Version control} (e.g., with Git) can track project's history.
+ \end{itemize}
+ \pause
+ \item \textbf{Minimal complexity:} Occam’s razor: “Never posit pluralities without necessity”.
+ \begin{itemize}
+ \item Avoiding the \alert{fashionable} tool of the day: tomorrow another tool will take its place!
+ \item Easier \alert{learning curve}, also doesn't create a \alert{generational gap}.
+ \item Is \alert{compatible} and \alert{extensible}.
+ \end{itemize}
+ \pause
+ \item \textbf{Verifiable inputs and outputs:} Inputs and Outputs must be \alert{automatically verified}.
+ \pause
+ \item \textbf{Free and open source software:} \alert{Free software} is essential: non-free software is not configurable, not distributable, and dependent on a non-free provider (which may discontinue it in N years).
+ \end{itemize}
+ \end{frame}
+
+
+
+ \newcommand{\focusonpackages}{}
+ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \let\focusonpackages\undefined
+
+
+
+
+
+ \begin{frame}{Predefined/exact software tools}
+ \small
+ \begin{columns}
+ \column{10cm}
+ \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
+ top=1pt, bottom=1pt, title=Reproducibility \&
+ software]
+ \footnotesize Reproducing the environment (specific
+ \alert{software versions}, \alert{build instructions} and
+ \alert{dependencies}) is also critically important for
+ reproducibility.
+ \end{tcolorbox}
+
+ \vspace{2cm}
+
+ \begin{itemize}
+ \setlength\itemsep{0.6cm}
+ \item \emph{Containers} or \emph{Virtual Machines} are a
+ \alert{binary black box}.
+
+ \item Maneage \alert{installs fixed versions} of all
+ necessary research software and their dependencies.
+
+ \item Installs similar environment on \alert{GNU/Linux}, or
+ \alert{macOS} systems.
+
+ \item Works very much like a package manager (e.g.,
+ \alert{\texttt{apt}} or \alert{\texttt{brew}}).
+ \end{itemize}
+
+ \column{5cm}
+ \includegraphics[width=\linewidth]{img/version.png}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Predefined/exact software tools}
+ \small
+ \begin{columns}
+ \column{10cm}
+ \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
+ top=1pt, bottom=1pt, title=Reproducibility \&
+ software]
+ \footnotesize Reproducing the environment (specific
+ \alert{software versions}, \alert{build instructions} and
+ \alert{dependencies}) is also critically important for
+ reproducibility.
+ \end{tcolorbox}
+
+ \vspace{2cm}
+
+ \begin{itemize}
+ \setlength\itemsep{0.6cm}
+ \item \emph{Containers} or \emph{Virtual Machines} are a
+ \alert{binary black box}.
+
+ \item Maneage \alert{installs fixed versions} of all
+ necessary research software and their dependencies.
+
+ \item Installs similar environment on \alert{GNU/Linux}, or
+ \alert{macOS} systems.
+
+ \item Works very much like a package manager (e.g.,
+ \alert{\texttt{apt}} or \alert{\texttt{brew}}).
+ \end{itemize}
+
+ \column{5cm}
+ \includegraphics[width=\linewidth]{img/version-highlighted.png}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Controlled environment and build instructions}
+ \small
+ \begin{columns}
+ \column{5.5cm}
+ \includegraphics[width=0.9\linewidth]{img/env.png}
+ \column{5.5cm}
+ \includegraphics[width=0.9\linewidth]{img/build.png}
+ \end{columns}
+ \end{frame}
+
+ \begin{frame}{Controlled environment and build instructions}
+ \small
+ \begin{columns}
+ \column{5.5cm}
+ \includegraphics[width=0.9\linewidth]{img/env-highlighted.png}
+ \column{5.5cm}
+ \includegraphics[width=0.9\linewidth]{img/build-highlighted.png}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies}
+ \Wider[5em]{
+ %\vspace{5mm}
+ \begin{center}
+ \includegraphics[width=0.9\linewidth]{img/matplotlib.png}
+ \end{center}
+
+ \vspace{3mm}\tiny From ``Attributing and Referencing (Research)
+ Software: Best Practices and Outlook from Inria'' (Alliez et
+ al. 2019,
+ \textcolor{blue}{\href{https://hal.archives-ouvertes.fr/hal-02135891}{hal-02135891}})
+ }
+ \end{frame}
+
+
+
+
+
+
+
+ \begin{frame}{Advantages of this build system}
+ \begin{columns}
+ \column{11cm}
+ \begin{itemize}
+ \setlength\itemsep{0.7cm}
+ \item Project runs in fixed/controlled environment: custom build
+ of \alert{Bash}, \alert{Make}, GNU Coreutils
+ (\alert{\texttt{ls}}, \alert{\texttt{cp}},
+ \alert{\texttt{mkdir}}, etc.), \alert{AWK}, or \alert{SED},
+ \alert{\LaTeX}, etc.
+ \item No need for \alert{root}/administrator \alert{permissions}
+ (on servers or super computers).
+ \item Whole system is built \alert{automatically} on any
+ Unix-like operating system \\(less than 2 hours).
+ \item Dependencies of different projects will \alert{not conflict}.
+ \item Everything in \alert{plain text} (human \& computer
+ readable/archivable).
+ \end{itemize}
+ \column{4cm}
+ \includegraphics[width=\linewidth]{img/unchained.jpg}\\
+ \tiny \url{https://natemowry2.wordpress.com}
+ \end{columns}
+ \end{frame}
+
+
+ \begin{frame}{Software citation automatically generated in paper (including Astropy)}
+ \centering
+ \includegraphics[width=0.8\linewidth]{img/software-cite.jpg}
+ \end{frame}
+ \begin{frame}{Software citation automatically generated in paper (including Astropy)}
+ \centering
+ \includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg}
+ \end{frame}
+
+ %% Hardware/data
+ \newcommand{\focusonhardware}{}
+ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \let\focusonhardware\undefined
+
+ \begin{frame}{Input data source and integrity is documented and checked}
+ \small
+ \begin{columns}
+ \column{10cm}
+ Stored information about each input file:
+ \begin{itemize}
+ \item \alert{PID} (where available).
+ \item Download \alert{URL}.
+ \item \alert{MD5}-sum to check integrity.
+ \end{itemize}
+
+ \vspace{0.75cm} All inputs are \alert{downloaded} from the given
+ PID/URL when necessary\\(during the analysis).
+
+ \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the
+ download was done properly or the file is the same (hasn't
+ changed on the server/source).
+
+ \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\
+ This paper needs three input files (two images, one catalog).
+
+ \column{5cm}
+ \includegraphics[width=\linewidth]{img/inputs.png}
+ \end{columns}
+ \end{frame}
+
+ \begin{frame}{Input data source and integrity is documented and checked}
+ \small
+ \begin{columns}
+ \column{10cm}
+ Stored information about each input file:
+ \begin{itemize}
+ \item \alert{PID} (where available).
+ \item Download \alert{URL}.
+ \item \alert{MD5}-sum to check integrity.
+ \end{itemize}
+
+ \vspace{0.75cm} All inputs are \alert{downloaded} from the given
+ PID/URL when necessary\\(during the analysis).
+
+ \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the
+ download was done properly or the file is the same (hasn't
+ changed on the server/source).
+
+ \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\
+ This paper needs three input files (two images, one catalog).
+
+ \column{5cm}
+ \includegraphics[width=\linewidth]{img/inputs-highlighted.png}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+
+
+
+
+
+
+ %% Analysis
+ \newcommand{\focusonrun}{}
+ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \let\focusonrun\undefined
+
+
+
+
+
+ \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
+ \small
+ \begin{columns}
+ \column{10cm}
+
+ All steps (downloading and analysis) are managed by Makefiles\\
+ (example from
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+ \vspace{5mm}
+ \begin{itemize}
+ \setlength\itemsep{0.7cm}
+ \item Unlike a script which always starts from the top, a
+ Makefile \alert{starts from the end} and steps that don't
+ change will be left untouched (not remade).
+ \item A single \emph{rule} can \alert{manage any number of
+ files}.
+ \item Make can identify independent steps internally and do them
+ in \alert{parallel}.
+ \item Make was \alert{designed for complex projects} with
+ thousands of files (all major Unix-like components), so it is
+ highly evolved and efficient.
+ \item Make is a very \alert{simple} and \alert{small} language,
+ thus easy to learn with great and free documentation (for
+ example
+ \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
+ Make's manual}}).
+ \end{itemize}
+
+ \column{5cm}
+ \includegraphics[width=\linewidth]{img/reproducible-makefile.png}
+ \end{columns}
+ \end{frame}
+ \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
+ \small
+ \begin{columns}
+ \column{10cm}
+
+ All steps (downloading and analysis) are managed by Makefiles\\
+ (example from
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+ \vspace{5mm}
+ \begin{itemize}
+ \setlength\itemsep{0.7cm}
+ \item Unlike a script which always starts from the top, a
+ Makefile \alert{starts from the end} and steps that don't
+ change will be left untouched (not remade).
+ \item A single \emph{rule} can \alert{manage any number of
+ files}.
+ \item Make can identify independent steps internally and do them
+ in \alert{parallel}.
+ \item Make was \alert{designed for complex projects} with
+ thousands of files (all major Unix-like components), so it is
+ highly evolved and efficient.
+ \item Make is a very \alert{simple} and \alert{small} language,
+ thus easy to learn with great and free documentation (for
+ example
+ \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
+ Make's manual}}).
+ \end{itemize}
+
+ \column{5cm}
+ \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-1.png}
+ \end{columns}
+ \end{frame}
+ \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
+ \small
+ \begin{columns}
+ \column{10cm}
+
+ All steps (downloading and analysis) are managed by Makefiles\\
+ (example from
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+ \vspace{5mm}
+ \begin{itemize}
+ \setlength\itemsep{0.7cm}
+ \item Unlike a script which always starts from the top, a
+ Makefile \alert{starts from the end} and steps that don't
+ change will be left untouched (not remade).
+ \item A single \emph{rule} can \alert{manage any number of
+ files}.
+ \item Make can identify independent steps internally and do them
+ in \alert{parallel}.
+ \item Make was \alert{designed for complex projects} with
+ thousands of files (all major Unix-like components), so it is
+ highly evolved and efficient.
+ \item Make is a very \alert{simple} and \alert{small} language,
+ thus easy to learn with great and free documentation (for
+ example
+ \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
+ Make's manual}}).
+ \end{itemize}
+
+ \column{5cm}
+ \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-2.png}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ \newcommand{\focusonpaper}{}
+ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \let\focusonpaper\undefined
+
+
+ \begin{frame}{Values in final report/paper}
+ All analysis \alert{results} (numbers, plots, tables) written in
+ paper's PDF as \alert{\LaTeX{} macros}. They are thus
+ \alert{updated automatically} on any change.\\ Shown here is a
+ portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source
+ (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).
+
+ \vspace{0.4cm}
+ \includegraphics[width=\linewidth]{img/reproducible-latex.png}
+ \end{frame}
+
+ \begin{frame}{Values in final report/paper}
+ All analysis \alert{results} (numbers, plots, tables) written in
+ paper's PDF as \alert{\LaTeX{} macros}. They are thus
+ \alert{updated automatically} on any change.\\ Shown here is a
+ portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source
+ (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).
+
+ \vspace{0.4cm}
+ \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Analysis step results/values concatenated into a single file.}
+ All \LaTeX{} macros come from a \alert{single file}.
+ \begin{center}
+ \includegraphics[width=0.6\linewidth]{img/reproducible-macros.png}
+ \end{center}
+ \end{frame}
+ \begin{frame}{Analysis step results/values concatenated into a single file.}
+ All \LaTeX{} macros come from a \alert{single file}.
+ \begin{center}
+ \includegraphics[width=0.6\linewidth]{img/reproducible-macros-highlighted.png}
+ \end{center}
+ \end{frame}
+
+
+
+
+
+
+
+ \begin{frame}{Analysis results stored as \LaTeX{} macros}
+ The analysis scripts write/update the \LaTeX{} macro values
+ automatically.
+ \begin{center}
+ \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png}
+ \end{center}
+ \end{frame}
+ \begin{frame}{Analysis results stored as \LaTeX{} macros}
+ The analysis scripts write/update the \LaTeX{} macro values
+ automatically.
+ \begin{center}
+ \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro-highlight.png}
+ \end{center}
+ \end{frame}
+
+
+ %% Make demo.
+ \begin{frame}
+ \LARGE
+ \vspace{1cm}
+ \hfill Let's see how the analysis is managed in a hypothetical project...
+ \end{frame}
+ \makedemoslide{img/data-lineage-1.pdf}
+ {Makefiles (\texttt{*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}}
+ \makedemoslide{img/data-lineage-2.pdf}
+ {The ultimate purpose of the project is to produce a paper/report (in PDF).}
+ \makedemoslide{img/data-lineage-3.pdf}
+ {The narrative description, typography and references are in \texttt{paper.tex} \& \texttt{references.tex}.}
+ \makedemoslide{img/data-lineage-4.pdf}
+ {Analysis outputs (blended into the PDF as \LaTeX{} macros) come from \texttt{project.tex}.}
+ \makedemoslide{img/data-lineage-5.pdf}
+ {But analysis outputs must first be \emph{verified} (with checksums) before entering the report/paper.}
+ \makedemoslide{img/data-lineage-6.pdf}
+ {Basic project info comes from \texttt{initialize.tex}.}
+ \makedemoslide{img/data-lineage-7.pdf}
+ {Reported values about the downloaded inputs come from \texttt{download.tex}.}
+ \makedemoslide{img/data-lineage-8.pdf}
+ {... for example the number of rows in the second input (a catalog) of the project.}
+ \makedemoslide{img/data-lineage-9.pdf}
+ {The URL to download \texttt{input2.dat}, and a checksum to validate it, are stored in \texttt{INPUTS.conf}.}
+ \makedemoslide{img/data-lineage-10.pdf}
+ {Reported values from first analysis steps stored in \texttt{analysis1.tex}.}
+ \makedemoslide{img/data-lineage-11.pdf}
+ {... for example the average of the numbers in \texttt{out-1b.dat}.}
+ \makedemoslide{img/data-lineage-12.pdf}
+ {But \texttt{out-1b.dat} itself depends on other files and a parameter (for example a multiple of sigma).}
+ \makedemoslide{img/data-lineage-13.pdf}
+ {\texttt{out-1a.dat} is built from a downloaded dataset.}
+ \makedemoslide{img/data-lineage-14.pdf}
+ {Download URL and checksum of \texttt{input1.dat} also stored in \texttt{INPUTS.conf}.}
+ \makedemoslide{img/data-lineage-15.pdf}
+ {Reported values from second analysis steps stored in \texttt{analysis2.tex}.}
+ \makedemoslide{img/data-lineage-16.pdf}
+ {... for example the number of selected rows in \texttt{out-2b.dat}.}
+ \makedemoslide{img/data-lineage-17.pdf}
+ {\texttt{out-2b.dat} is derived from \texttt{out-1b.dat} (for example, rejected some of \texttt{out-1b.dat}'s rows).}
+ \makedemoslide{img/data-lineage-18.pdf}
+ {Reported values from third analysis steps stored in \texttt{analysis3.tex}.}
+ \makedemoslide{img/data-lineage-19.pdf}
+ {... for example measurements from both \texttt{out-3a.dat} and \texttt{out-3b.dat}.}
+ \makedemoslide{img/data-lineage-20.pdf}
+ {\texttt{out-3b.dat} is generated from an analysis on \texttt{out-2a.dat}.}
+ \makedemoslide{img/data-lineage-21.pdf}
+ {But \texttt{out-2a.dat} itself is generated from \texttt{input1.dat} and an analysis which has two settings.}
+ \makedemoslide{img/data-lineage-22.pdf}
+ {\texttt{out-3a.dat} also depends on \texttt{out-1a.dat} and an analysis which needs one parameter.}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ \newcommand{\allopacity}{1}
+ \begin{frame}{All questions have an answer now (in
+ \alert{plain text}: human \& computer readable/archivable).}
+ \include{tex/project-graph} \end{frame}
+ \newcommand{\gitlogo}{}
+ \begin{frame}{All questions have an answer now (in
+ \alert{plain text}: so we can use Git to keep its history).}
+ \include{tex/project-graph}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\projinit}{}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\projwork}{}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\tempevolve}{}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\mergewithtemp}{}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\tofuture}{}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\githappy}{}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\gitverified}{}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+
+ \begin{frame}{Two recent examples (publishing Git checksum in abstract)}
+ \begin{columns}
+ \column{0.5\linewidth}
+ \centering
+ \includegraphics[width=0.8\linewidth]{img/firstpage-190911230.png}
+ \column{0.5\linewidth}
+ \centering
+ \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491.png}
+ \end{columns}
+ \end{frame}
+
+ \begin{frame}{Two recent examples (publishing Git checksum in abstract)}
+ \begin{columns}
+ \column{0.5\linewidth}
+ \centering
+ \includegraphics[width=0.8\linewidth]{img/firstpage-190911230-highlighted.png}
+ \column{0.5\linewidth}
+ \centering
+ \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491-highlighted.png}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Publication of the project}
+
+ A reproducible project using Maneage will have the following
+ (\alert{plain text}) components:
+ \begin{itemize}
+ \item Makefiles.
+ \item \LaTeX{} source files.
+ \item Configuration files for software used in analysis.
+ \item Scripts/programming files (e.g., Python, Shell, AWK, C).
+ \end{itemize}
+ The \alert{volume} of the project's source will thus be
+ \alert{negligible} compared to a single figure in a paper
+ (usually $\sim100$ kilo-bytes).
+
+ \vspace{1cm} The project's pipeline (customized Maneage) can be
+ \alert{published} in
+ \begin{itemize}
+ \item \alert{arXiv}: uploaded with the \LaTeX{} source to always
+ stay with the paper \\(for example
+ \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The
+ file containing all macros must also be uploaded so arXiv's
+ server can easily build the \LaTeX{} source.
+ \item \alert{Zenodo}: Along with all the input datasets (many
+ Gigabytes) and software \\(for example
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}}) and given a unique DOI.
+ \end{itemize}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Project source and its execution}
+ \begin{tcolorbox}
+ Programs \textcolor{gray}{[here: Scientific projects]} must be
+ written for \alert{people to read}...
+
+ \hfill ...and only \emph{incidentally} for machines to
+ \emph{execute}.
+
+ \vspace{2mm}
+ \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs
+ \end{tcolorbox}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}[t]{General outline of using this system (for example \href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230})}
+ \vspace{1cm}
+ \texttt{\$ git clone http://gitlab.com/makhlaghi/iau-symposium-355{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\
+
+ \pause
+ \vspace{1.5cm}
+ \texttt{\$ ./project configure { }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# You will specify the build directory on your system,}}\\
+ \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# and it will build all software (about 1.5 hours).}}
+
+ \pause
+ \vspace{1.5cm}
+ \texttt{\$ ./project make { }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# Does all the analysis and makes final PDF.}}\\
+ \end{frame}
+
+
+
+
+
+
+ \begin{frame}{Future prospects...}
+ \large Adoption of reproducibility by many researchers will enable
+ the following:
+
+ \vspace{1em}
+ \begin{itemize}
+ \setlength\itemsep{3mm}
+ \item A repository for education/training \textcolor{gray}{(PhD
+ students, or researchers in other fields)}.
+ \item Easy \alert{verification}/\alert{understanding} of other
+ research projects \textcolor{gray}{(when necessary)}.
+ \item Trivially \alert{test} different steps of others' work
+ \textcolor{gray}{(different configurations, software, etc.)}.
+ \item Science can progress \alert{incrementally}
+ \textcolor{gray}{(shorter papers actually building on each
+ other!)}.
+ \item \alert{Extract meta-data} after the publication of a dataset
+ \textcolor{gray}{(for future ontologies or vocabularies)}.
+ \item Applying \alert{machine learning} on reproducible research
+ projects will allow us to solve some Big Data Challenges:
+
+ \vspace{1em}
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item \emph{Extract the relevant parameters automatically}.
+ \item \emph{Translate the science to enormous samples}.
+ \item \emph{Believe the results when no one will have time to
+ reproduce}.
+ \item \emph{Have confidence in results derived using machine
+ learning or AI}.
+ \end{itemize}
+ \end{itemize}
+ \end{frame}
+
+\begin{frame}{Existing technologies (Independent environment)}
+ \begin{itemize}
+ \setlength\itemsep{7mm}
+ \item \textbf{Virtual machines:}
+ \begin{itemize}
+ \setlength\itemsep{3mm}
+ \item Contain the \alert{full operating system}, are thus very large ($\times$Gigabytes).
+ \item In \emph{binary} format (decoding a built VM's environment is extremely hard and inaccurate).
+ \end{itemize}
+ \item \textbf{Containers:} (For example Docker or Singularity)
+ \begin{itemize}
+ \setlength\itemsep{3mm}
+ \item Similar to virtual machines, but \alert{without low-level kernel} (use host's kernel).
+ \item \alert{Will fail} as soon as kernel is no longer supported\\(for example Docker currently only supports Linux kernel 3.10 and above \alert{from 2013}).
+ \item Good solutions for software engineers (that need to \emph{reproduce a bug's environment today}).
+ \item Docker is modular, needs root privileges (not available in HPCs), Dockerfiles allow incompleteness\\(especially in the common scenario of using the operating system's package manager, see next slide)
+ \item Singularity is monolithic and thus can be very large.
+ \item In \alert{binary} format (similar to VMs, especially when OS package managers are used).
+ \end{itemize}
+ \end{itemize}
+
+ \vspace{3mm}
+In summary, they only \alert{store a built} environment (they are outputs, not good for archiving).
+
+\end{frame}
+
+
+
+
+
+\begin{frame}{Existing technologies (Package managers)}
+
+ \begin{itemize}
+ \item \textbf{Operating system package managers:}
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software).
+ \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible).
+ \item Older software (for example +5 years) is usually removed.
+ \end{itemize}
+ \item \textbf{Conda/Anaconda:}
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item Conda has build instructions for software and their dependencies.
+ \item But it doesn't go down to the C library or the lower-level components of operating system.
+ \item It is written in Python (can't be used later when current Python is deprecated).
+ \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility).
+ \end{itemize}
+ \item \textbf{Nix, or GNU Guix:}
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access.
+ \item Doesn't \emph{require} documentation of dependencies.
+ \end{itemize}
+ \item \textbf{Spack:} Similar to Nix/Guix but written in Python.
+ \end{itemize}
+\end{frame}
+
+\begin{frame}{Existing technologies (workflow tools)}
+ \begin{itemize}
+ \setlength\itemsep{4mm}
+ \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda.
+ \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI makes it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding.
+ \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them.
+ \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker.
+ \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker.
+ \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software.
+ \end{itemize}
+ \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}.
+\end{frame}
+
+
+
+
+ \begin{frame}{Summary:}
+
+ Maneage is introduced as a customizable template that will do the
+ following steps/instructions (all in simple plain text files).
+ \begin{itemize}
+ \item \alert{Automatically downloads} the necessary
+ \emph{software} and \emph{data}.
+ \item \alert{Builds} the software in a \alert{closed
+ environment}.
+ \item Runs the software on data to \alert{generate} the final
+ \alert{research results}.
+ \item A modification in one part of the analysis will only
+ result in re-doing that part, not the whole project.
+ \item Using LaTeX macros, paper's figures, tables and numbers
+ will be \alert{automatically updated} after a change in
+ analysis. Allowing the scientist to focus on the scientific
+ interpretation.
+ \item The whole project is under \alert{version control} (Git)
+ to allow easy reversion to a previous state. This
+ \alert{encourages tests/experimentation} in the analysis.
+ \item The \alert{Git commit hash} of the project source is
+ \alert{printed} in the published paper and \alert{saved on
+ output} data products. Ensuring the
+ integrity/reproducibility of the result.
+ \item \colorbox{green!30!white}{These slides are available at
+ \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.}
+ \end{itemize}
+
+ \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
+ top=1pt, bottom=1pt]
+ For a technical description of Maneage's implementation, as well
+ as a checklist to customize it, and tips on good practices,
+ please see this page:
+
+ \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}}
+ \end{tcolorbox}
+ \end{frame}
\end{document}