First draft of short slides

A first draft of the summarized slides was written for the talk in the RDA Adoption week.
author: Mohammad Akhlaghi <mohammad@akhlaghi.org> 2020-06-17 05:16:58 +0100
committer: Mohammad Akhlaghi <mohammad@akhlaghi.org> 2020-06-17 05:16:58 +0100
commit: e543088567ce06626f5b94219d63bddafb1d8d6f (patch)
tree: 5b497ce257c91ebc04d86823b5e739850f9bd452
parent: 11696587813b77a79ae72da9b3ea01f602ceb231 (diff)
2 files changed, 913 insertions, 3 deletions
diff --git a/img/wds.jpg b/img/wds.jpg
new file mode 100644
index 0000000..032fb1d
--- /dev/null
+++ b/img/wds.jpg
diff --git a/slides-intro-short.tex b/slides-intro-short.tex
index b5e680a..88c14aa 100644
--- a/slides-intro-short.tex
+++ b/slides-intro-short.tex
@@ -34,11 +34,13 @@
     (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}
 
 %% Set the title
-\title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility
-  \\ {\normalsize Introducing \emph{Maneage}: customizable framework for \emph{man}aging data lin\emph{eage}}}
+\title{{\huge\textbf{BIG} Data, \textbf{BIG} responsibility} \\
+  {\LARGE Towards Long-term and Archivable Reproducibility} \\
+  \vspace{2mm}{\small [RDA Europe Adoption grant recipient. Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE} (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}), Comments welcome]}
+}
 
 %% Set the author
-\author{\vspace{1cm}\\
+\author{\vspace{8mm}\\
   \href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm}
   \footnotesize
   Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain
@@ -88,4 +90,912 @@
     \titlepage
   \end{frame}
 
+
+  \begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}}
+    Challenges (also relevant to researchers, not just repositories)
+    \begin{itemize}
+    \item \emph{Bi-directional linking}: how to \alert{link data and publications}.
+    \item \emph{\alert{Software management}:} how to manage, preserve, publish and cite software?
+    \item \emph{Metrics:} \alert{how often} are data used.
+    \item \emph{Incentives to researchers:} how to \alert{communicate benefits} of following good practices \alert{to researchers}.
+    \end{itemize}
+
+    \begin{center}
+      \includegraphics[width=4cm]{img/rda.png}\hspace{1cm}
+      \includegraphics[width=4cm]{img/wds.jpg}
+    \end{center}
+
+    \pause
+    ``\emph{We would like to see a workflow that results in all
+      \textcolor{blue!30!green}{\bf scholarly objects being connected},
+      linked, citable, and persistent to allow researchers to navigate
+      smoothly and to \alert{\bf enable reproducible research}.  This
+      includes \alert{{\bf linkages} between documentation, code, data, and
+        journal articles in an integrated environment}. Furthermore,
+      in the ideal workflow, all of these objects need to be
+      \alert{\bf well documented} to enable other researchers (or
+      citizen scientists etc) to reuse the data for new
+      discoveries.}''
+  \end{frame}
+
+\usebackgroundtemplate{ }    %% undeclare it
+
+
+  \newcommand{\allopacity}{1}
+  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+  \newcommand{\paperinit}{}
+  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+  \newcommand{\sver}{}
+  \newcommand{\srep}{}
+  \newcommand{\dver}{}
+  \newcommand{\ddver}{}
+  \newcommand{\confopt}{}
+  \newcommand{\confenv}{}
+  \newcommand{\containers}{}
+  \newcommand{\db}{}
+  \newcommand{\calib}{}
+  \newcommand{\corr}{}
+  \newcommand{\runord}{}
+  \newcommand{\runopt}{}
+  \newcommand{\humanerr}{}
+  \newcommand{\confirmbias}{}
+  \newcommand{\depupdate}{}
+  \newcommand{\coauth}{}
+  \newcommand{\varsinpaper}{}
+  \newcommand{\recordinfo}{}
+  \newcommand{\softcite}{}
+  \newcommand{\prevchange}{}
+  \newcommand{\paperfinal}{}
+  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+
+  %% Don't show the happy scientist or the existing containers box.
+  \let\paperinit\undefined
+  \let\allopacity\undefined
+  \let\paperfinal\undefined
+  \let\containers\undefined
+
+
+
+
+
+  \begin{frame}{Science is a tricky business}
+    \begin{center}
+      \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg}
+    \end{center}
+
+    \vspace{-0.3cm}\hfill
+    {\tiny Image from nature.com
+      (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five
+        ways to fix statistics}'', Nov 2017)}\hspace{7mm}
+
+    \vspace{-1mm}
+    \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm]
+      \small Data analysis [...] is a \alert{human
+        behavior}. Researchers who hunt hard enough will turn up a
+      result that fits statistical criteria, but their
+      \alert{discovery} will probably be a \alert{false positive}.
+
+      \hfill Five ways to fix statistics, Nature, 551, Nov 2017.
+    \end{tcolorbox}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{Founding criteria}
+    \begin{tcolorbox}[title=Basic/simple principle:]
+      \centering Science is defined by its METHOD, \alert{not} its
+      result.
+    \end{tcolorbox}
+
+    \pause
+    \begin{itemize}
+    \item \textbf{Complete/self-contained:}
+      \begin{itemize}
+      \item \alert{Only dependency} should be \alert{POSIX} tools \textcolor{gray}{(discards Conda or Jupyter which need Python)}.
+      \item Must \alert{not require root} permissions \textcolor{gray}{(discards tools like Docker or Nix/Guix)}.
+      \item Should be \alert{non-interactive} or runnable in batch (user interaction is an incompleteness).
+      \item Should be usable \alert{without internet} connection.
+      \end{itemize}
+
+      \pause
+    \item \textbf{Modularity:} Parts of the project should be \alert{re-usable} in other projects.
+      \pause
+    \item \textbf{Plain text:} Project's source should be in \alert{plain-text} \textcolor{gray}{(binary formats need special software)}
+      \begin{itemize}
+      \item This includes high-level analysis.
+      \item It is easily publishable (very low volume of $\times100$KB), archivable, and parse-able.
+      \item \alert{Version control} (e.g., with Git) can track project's history.
+      \end{itemize}
+      \pause
+    \item \textbf{Minimal complexity:} Occam’s razor: “Never posit pluralities without necessity”.
+      \begin{itemize}
+      \item Avoiding the \alert{fashionable} tool of the day: tomorrow another tool will take its place!
+      \item Easier \alert{learning curve}, also doesn't create a \alert{generational gap}.
+      \item Is \alert{compatible} and \alert{extensible}.
+      \end{itemize}
+      \pause
+    \item \textbf{Verifiable inputs and outputs:} Inputs and Outputs must be \alert{automatically verified}.
+      \pause
+    \item \textbf{Free and open source software:} \alert{Free software} is essential: non-free software is not configurable, not distributable, and dependent on non-free provider (which may discontinue it in N years).
+    \end{itemize}
+  \end{frame}
+
+
+
+  \newcommand{\focusonpackages}{}
+  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+  \let\focusonpackages\undefined
+
+
+
+
+
+  \begin{frame}{Predefined/exact software tools}
+    \small
+    \begin{columns}
+      \column{10cm}
+      \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
+                        top=1pt, bottom=1pt, title=Reproducibility \&
+                        software]
+        \footnotesize Reproducing the environment (specific
+        \alert{software versions}, \alert{build instructions} and
+        \alert{dependencies}) is also critically important for
+        reproducibility.
+      \end{tcolorbox}
+
+      \vspace{2cm}
+
+      \begin{itemize}
+        \setlength\itemsep{0.6cm}
+      \item \emph{Containers} or \emph{Virtual Machines} are a
+        \alert{binary black box}.
+
+      \item Maneage \alert{installs fixed versions} of all
+        necessary research software and their dependencies.
+
+      \item Installs similar environment on \alert{GNU/Linux}, or
+        \alert{macOS} systems.
+
+      \item Works very much like a package manager (e.g.,
+        \alert{\texttt{apt}} or \alert{\texttt{brew}}).
+      \end{itemize}
+
+      \column{5cm}
+      \includegraphics[width=\linewidth]{img/version.png}
+    \end{columns}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{Predefined/exact software tools}
+    \small
+    \begin{columns}
+      \column{10cm}
+      \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
+                        top=1pt, bottom=1pt, title=Reproducibility \&
+                        software]
+        \footnotesize Reproducing the environment (specific
+        \alert{software versions}, \alert{build instructions} and
+        \alert{dependencies}) is also critically important for
+        reproducibility.
+      \end{tcolorbox}
+
+      \vspace{2cm}
+
+      \begin{itemize}
+        \setlength\itemsep{0.6cm}
+      \item \emph{Containers} or \emph{Virtual Machines} are a
+        \alert{binary black box}.
+
+      \item Maneage \alert{installs fixed versions} of all
+        necessary research software and their dependencies.
+
+      \item Installs similar environment on \alert{GNU/Linux}, or
+        \alert{macOS} systems.
+
+      \item Works very much like a package manager (e.g.,
+        \alert{\texttt{apt}} or \alert{\texttt{brew}}).
+      \end{itemize}
+
+      \column{5cm}
+      \includegraphics[width=\linewidth]{img/version-highlighted.png}
+    \end{columns}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{Controlled environment and build instructions}
+    \small
+    \begin{columns}
+      \column{5.5cm}
+      \includegraphics[width=0.9\linewidth]{img/env.png}
+      \column{5.5cm}
+      \includegraphics[width=0.9\linewidth]{img/build.png}
+    \end{columns}
+  \end{frame}
+
+  \begin{frame}{Controlled environment and build instructions}
+    \small
+    \begin{columns}
+      \column{5.5cm}
+      \includegraphics[width=0.9\linewidth]{img/env-highlighted.png}
+      \column{5.5cm}
+      \includegraphics[width=0.9\linewidth]{img/build-highlighted.png}
+    \end{columns}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies}
+    \Wider[5em]{
+      %\vspace{5mm}
+      \begin{center}
+        \includegraphics[width=0.9\linewidth]{img/matplotlib.png}
+      \end{center}
+
+      \vspace{3mm}\tiny From ``Attributing and Referencing (Research)
+      Software: Best Practices and Outlook from Inria'' (Alliez et
+      al. 2019,
+      \textcolor{blue}{\href{https://hal.archives-ouvertes.fr/hal-02135891}{hal-02135891}})
+    }
+  \end{frame}
+
+
+
+
+
+
+
+  \begin{frame}{Advantages of this build system}
+    \begin{columns}
+      \column{11cm}
+      \begin{itemize}
+        \setlength\itemsep{0.7cm}
+      \item Project runs in fixed/controlled environment: custom build
+        of \alert{Bash}, \alert{Make}, GNU Coreutils
+        (\alert{\texttt{ls}}, \alert{\texttt{cp}},
+        \alert{\texttt{mkdir}}, etc.), \alert{AWK}, or \alert{SED},
+        \alert{\LaTeX}, etc.
+      \item No need for \alert{root}/administrator \alert{permissions}
+        (on servers or super computers).
+      \item Whole system is built \alert{automatically} on any
+        Unix-like operating system \\(less than 2 hours).
+      \item Dependencies of different projects will \alert{not conflict}.
+      \item Everything in \alert{plain text} (human \& computer
+        readable/archivable).
+      \end{itemize}
+      \column{4cm}
+      \includegraphics[width=\linewidth]{img/unchained.jpg}\\
+      \tiny \url{https://natemowry2.wordpress.com}
+    \end{columns}
+  \end{frame}
+
+
+  \begin{frame}{Software citation automatically generated in paper (including Astropy)}
+    \centering
+    \includegraphics[width=0.8\linewidth]{img/software-cite.jpg}
+  \end{frame}
+  \begin{frame}{Software citation automatically generated in paper (including Astropy)}
+    \centering
+    \includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg}
+  \end{frame}
+
+  %% Hardware/data
+  \newcommand{\focusonhardware}{}
+  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+  \let\focusonhardware\undefined
+
+  \begin{frame}{Input data source and integrity is documented and checked}
+    \small
+    \begin{columns}
+      \column{10cm}
+      Stored information about each input file:
+      \begin{itemize}
+      \item \alert{PID} (where available).
+      \item Download \alert{URL}.
+      \item \alert{MD5}-sum to check integrity.
+      \end{itemize}
+
+      \vspace{0.75cm} All inputs are \alert{downloaded} from the given
+      PID/URL when necessary\\(during the analysis).
+
+      \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the
+      download was done properly or the file is the same (hasn't
+      changed on the server/source).
+
+      \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\
+      This paper needs three input files (two images, one catalog).
+
+      \column{5cm}
+      \includegraphics[width=\linewidth]{img/inputs.png}
+    \end{columns}
+  \end{frame}
+
+  \begin{frame}{Input data source and integrity is documented and checked}
+    \small
+    \begin{columns}
+      \column{10cm}
+      Stored information about each input file:
+      \begin{itemize}
+      \item \alert{PID} (where available).
+      \item Download \alert{URL}.
+      \item \alert{MD5}-sum to check integrity.
+      \end{itemize}
+
+      \vspace{0.75cm} All inputs are \alert{downloaded} from the given
+      PID/URL when necessary\\(during the analysis).
+
+      \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the
+      download was done properly or the file is the same (hasn't
+      changed on the server/source).
+
+      \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\
+      This paper needs three input files (two images, one catalog).
+
+      \column{5cm}
+      \includegraphics[width=\linewidth]{img/inputs-highlighted.png}
+    \end{columns}
+  \end{frame}
+
+
+
+
+
+
+
+
+
+
+
+  %% Analysis
+  \newcommand{\focusonrun}{}
+  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+  \let\focusonrun\undefined
+
+
+
+
+
+  \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
+    \small
+    \begin{columns}
+      \column{10cm}
+
+      All steps (downloading and analysis) are managed by Makefiles\\
+      (example from
+      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+      \vspace{5mm}
+      \begin{itemize}
+        \setlength\itemsep{0.7cm}
+      \item Unlike a script which always starts from the top, a
+        Makefile \alert{starts from the end} and steps that don't
+        change will be left untouched (not remade).
+      \item A single \emph{rule} can \alert{manage any number of
+        files}.
+      \item Make can identify independent steps internally and do them
+        in \alert{parallel}.
+      \item Make was \alert{designed for complex projects} with
+        thousands of files (all major Unix-like components), so it is
+        highly evolved and efficient.
+      \item Make is a very \alert{simple} and \alert{small} language,
+        thus easy to learn with great and free documentation (for
+        example
+        \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
+            Make's manual}}).
+      \end{itemize}
+
+      \column{5cm}
+      \includegraphics[width=\linewidth]{img/reproducible-makefile.png}
+    \end{columns}
+  \end{frame}
+  \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
+    \small
+    \begin{columns}
+      \column{10cm}
+
+      All steps (downloading and analysis) are managed by Makefiles\\
+      (example from
+      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+      \vspace{5mm}
+      \begin{itemize}
+        \setlength\itemsep{0.7cm}
+      \item Unlike a script which always starts from the top, a
+        Makefile \alert{starts from the end} and steps that don't
+        change will be left untouched (not remade).
+      \item A single \emph{rule} can \alert{manage any number of
+        files}.
+      \item Make can identify independent steps internally and do them
+        in \alert{parallel}.
+      \item Make was \alert{designed for complex projects} with
+        thousands of files (all major Unix-like components), so it is
+        highly evolved and efficient.
+      \item Make is a very \alert{simple} and \alert{small} language,
+        thus easy to learn with great and free documentation (for
+        example
+        \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
+            Make's manual}}).
+      \end{itemize}
+
+      \column{5cm}
+      \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-1.png}
+    \end{columns}
+  \end{frame}
+  \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
+    \small
+    \begin{columns}
+      \column{10cm}
+
+      All steps (downloading and analysis) are managed by Makefiles\\
+      (example from
+      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+      \vspace{5mm}
+      \begin{itemize}
+        \setlength\itemsep{0.7cm}
+      \item Unlike a script which always starts from the top, a
+        Makefile \alert{starts from the end} and steps that don't
+        change will be left untouched (not remade).
+      \item A single \emph{rule} can \alert{manage any number of
+        files}.
+      \item Make can identify independent steps internally and do them
+        in \alert{parallel}.
+      \item Make was \alert{designed for complex projects} with
+        thousands of files (all major Unix-like components), so it is
+        highly evolved and efficient.
+      \item Make is a very \alert{simple} and \alert{small} language,
+        thus easy to learn with great and free documentation (for
+        example
+        \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
+            Make's manual}}).
+      \end{itemize}
+
+      \column{5cm}
+      \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-2.png}
+    \end{columns}
+  \end{frame}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  \newcommand{\focusonpaper}{}
+  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+  \let\focusonpaper\undefined
+
+
+  \begin{frame}{Values in final report/paper}
+    All analysis \alert{results} (numbers, plots, tables) written in
+    paper's PDF as \alert{\LaTeX{} macros}. They are thus
+    \alert{updated automatically} on any change.\\ Shown here is a
+    portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source
+    (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).
+
+    \vspace{0.4cm}
+    \includegraphics[width=\linewidth]{img/reproducible-latex.png}
+  \end{frame}
+
+  \begin{frame}{Values in final report/paper}
+    All analysis \alert{results} (numbers, plots, tables) written in
+    paper's PDF as \alert{\LaTeX{} macros}. They are thus
+    \alert{updated automatically} on any change.\\ Shown here is a
+    portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source
+    (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).
+
+    \vspace{0.4cm}
+    \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{Analysis step results/values concatenated into a single file.}
+    All \LaTeX{} macros come from a \alert{single file}.
+    \begin{center}
+      \includegraphics[width=0.6\linewidth]{img/reproducible-macros.png}
+    \end{center}
+  \end{frame}
+  \begin{frame}{Analysis step results/values concatenated into a single file.}
+    All \LaTeX{} macros come from a \alert{single file}.
+    \begin{center}
+      \includegraphics[width=0.6\linewidth]{img/reproducible-macros-highlighted.png}
+    \end{center}
+  \end{frame}
+
+
+
+
+
+
+
+  \begin{frame}{Analysis results stored as \LaTeX{} macros}
+    The analysis scripts write/update the \LaTeX{} macro values
+    automatically.
+    \begin{center}
+      \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png}
+    \end{center}
+  \end{frame}
+  \begin{frame}{Analysis results stored as \LaTeX{} macros}
+    The analysis scripts write/update the \LaTeX{} macro values
+    automatically.
+    \begin{center}
+      \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro-highlight.png}
+    \end{center}
+  \end{frame}
+
+
+  %% Make demo.
+  \begin{frame}
+    \LARGE
+    \vspace{1cm}
+    \hfill Let's see how the analysis is managed in a hypothetical project...
+  \end{frame}
+  \makedemoslide{img/data-lineage-1.pdf}
+                {Makefiles (\texttt{*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}}
+  \makedemoslide{img/data-lineage-2.pdf}
+                {The ultimate purpose of the project is to produce a paper/report (in PDF).}
+  \makedemoslide{img/data-lineage-3.pdf}
+                {The narrative description, typography and references are in \texttt{paper.tex} \& \texttt{references.tex}.}
+  \makedemoslide{img/data-lineage-4.pdf}
+                {Analysis outputs (blended into the PDF as \LaTeX{} macros) come from \texttt{project.tex}.}
+  \makedemoslide{img/data-lineage-5.pdf}
+                {But analysis outputs must first be \emph{verified} (with checksums) before entering the report/paper.}
+  \makedemoslide{img/data-lineage-6.pdf}
+                {Basic project info comes from \texttt{initialize.tex}.}
+  \makedemoslide{img/data-lineage-7.pdf}
+                {Reported values about the downloaded inputs come from \texttt{download.tex}.}
+  \makedemoslide{img/data-lineage-8.pdf}
+                {... for example the number of rows in the second input (a catalog) of the project.}
+  \makedemoslide{img/data-lineage-9.pdf}
+                {The URL to download \texttt{input2.dat}, and a checksum to validate it, are stored in \texttt{INPUTS.conf}.}
+  \makedemoslide{img/data-lineage-10.pdf}
+                {Reported values from first analysis steps stored in \texttt{analysis1.tex}.}
+  \makedemoslide{img/data-lineage-11.pdf}
+                {... for example the average of the numbers in \texttt{out-1b.dat}.}
+  \makedemoslide{img/data-lineage-12.pdf}
+                {But \texttt{out-1b.dat} itself depends on other files and a parameter (for example a multiple of sigma).}
+  \makedemoslide{img/data-lineage-13.pdf}
+                {\texttt{out-1a.dat} is built from a downloaded dataset.}
+  \makedemoslide{img/data-lineage-14.pdf}
+                {Download URL and checksum of \texttt{input1.dat} also stored in \texttt{INPUTS.conf}.}
+  \makedemoslide{img/data-lineage-15.pdf}
+                {Reported values from second analysis steps stored in \texttt{analysis2.tex}.}
+  \makedemoslide{img/data-lineage-16.pdf}
+                {... for example the number of selected rows in \texttt{out-2b.dat}.}
+  \makedemoslide{img/data-lineage-17.pdf}
+                {\texttt{out-2b.dat} is derived from \texttt{out-1b.dat} (for example, rejected some of \texttt{out-1b.dat}'s rows).}
+  \makedemoslide{img/data-lineage-18.pdf}
+                {Reported values from third analysis steps stored in \texttt{analysis3.tex}.}
+  \makedemoslide{img/data-lineage-19.pdf}
+                {... for example measurements from both \texttt{out-3a.dat} and \texttt{out-3b.dat}.}
+  \makedemoslide{img/data-lineage-20.pdf}
+                {\texttt{out-3b.dat} is generated from an analysis on \texttt{out-2a.dat}.}
+  \makedemoslide{img/data-lineage-21.pdf}
+                {But \texttt{out-2a.dat} itself is generated from \texttt{input1.dat} and an analysis which has two settings.}
+  \makedemoslide{img/data-lineage-22.pdf}
+                {\texttt{out-3a.dat} also depends on \texttt{out-1a.dat} and an analysis which needs one parameter.}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  \newcommand{\allopacity}{1}
+  \begin{frame}{All questions have an answer now (in
+        \alert{plain text}: human \& computer readable/archivable).}
+    \include{tex/project-graph} \end{frame}
+  \newcommand{\gitlogo}{}
+  \begin{frame}{All questions have an answer now (in
+        \alert{plain text}: so we can use Git to keep its history).}
+    \include{tex/project-graph}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+  \newcommand{\projinit}{}
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+  \newcommand{\projwork}{}
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+  \newcommand{\tempevolve}{}
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+  \newcommand{\mergewithtemp}{}
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+  \newcommand{\tofuture}{}
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+  \newcommand{\githappy}{}
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+  \newcommand{\gitverified}{}
+  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+
+  \begin{frame}{Two recent examples (publishing Git checksum in abstract)}
+    \begin{columns}
+      \column{0.5\linewidth}
+      \centering
+      \includegraphics[width=0.8\linewidth]{img/firstpage-190911230.png}
+      \column{0.5\linewidth}
+      \centering
+      \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491.png}
+    \end{columns}
+  \end{frame}
+
+  \begin{frame}{Two recent examples (publishing Git checksum in abstract)}
+    \begin{columns}
+      \column{0.5\linewidth}
+      \centering
+      \includegraphics[width=0.8\linewidth]{img/firstpage-190911230-highlighted.png}
+      \column{0.5\linewidth}
+      \centering
+      \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491-highlighted.png}
+    \end{columns}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{Publication of the project}
+
+    A reproducible project using Maneage will have the following
+    (\alert{plain text}) components:
+    \begin{itemize}
+    \item Makefiles.
+    \item \LaTeX{} source files.
+    \item Configuration files for software used in analysis.
+    \item Scripts/programming files (e.g., Python, Shell, AWK, C).
+    \end{itemize}
+    The \alert{volume} of the project's source will thus be
+    \alert{negligible} compared to a single figure in a paper
+    (usually $\sim100$ kilo-bytes).
+
+    \vspace{1cm} The project's pipeline (customized Maneage) can be
+    \alert{published} in
+    \begin{itemize}
+    \item \alert{arXiv}: uploaded with the \LaTeX{} source to always
+      stay with the paper \\(for example
+      \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The
+      file containing all macros must also be uploaded so arXiv's
+      server can easily build the \LaTeX{} source.
+    \item \alert{Zenodo}: Along with all the input datasets (many
+      Gigabytes) and software \\(for example
+      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}}) and given a unique DOI.
+    \end{itemize}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}{Project source and its execution}
+    \begin{tcolorbox}
+      Programs \textcolor{gray}{[here: Scientific projects]} must be
+      written for \alert{people to read}...
+
+      \hfill ...and only \emph{incidentally} for machines to
+      \emph{execute}.
+
+      \vspace{2mm}
+      \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs
+    \end{tcolorbox}
+  \end{frame}
+
+
+
+
+
+  \begin{frame}[t]{General outline of using this system (for example \href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230})}
+    \vspace{1cm}
+    \texttt{\$ git clone http://gitlab.com/makhlaghi/iau-symposium-355{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\
+
+    \pause
+    \vspace{1.5cm}
+    \texttt{\$ ./project configure { }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# You will specify the build directory on your system,}}\\
+    \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# and it will build all software (about 1.5 hours).}}
+
+    \pause
+    \vspace{1.5cm}
+    \texttt{\$ ./project make { }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# Does all the analysis and makes final PDF.}}\\
+  \end{frame}
+
+
+
+
+
+
+    \begin{frame}{Future prospects...}
+    \large Adoption of reproducibility by many researchers will enable
+    the following:
+
+    \vspace{1em}
+    \begin{itemize}
+      \setlength\itemsep{3mm}
+    \item A repository for education/training \textcolor{gray}{(PhD
+      students, or researchers in other fields)}.
+    \item Easy \alert{verification}/\alert{understanding} of other
+      research projects \textcolor{gray}{(when necessary)}.
+    \item Trivially \alert{test} different steps of others' work
+      \textcolor{gray}{(different configurations, software, etc.)}.
+    \item Science can progress \alert{incrementally}
+      \textcolor{gray}{(shorter papers actually building on each
+        other!)}.
+    \item \alert{Extract meta-data} after the publication of a dataset
+      \textcolor{gray}{(for future ontologies or vocabularies)}.
+    \item Applying \alert{machine learning} on reproducible research
+      projects will allow us to solve some Big Data Challenges:
+
+      \vspace{1em}
+      \begin{itemize}
+        \setlength\itemsep{2mm}
+      \item \emph{Extract the relevant parameters automatically}.
+      \item \emph{Translate the science to enormous samples}.
+      \item \emph{Believe the results when no one will have time to
+        reproduce}.
+      \item \emph{Have confidence in results derived using machine
+        learning or AI}.
+      \end{itemize}
+    \end{itemize}
+  \end{frame}
+
+\begin{frame}{Existing technologies (Independent environment)}
+  \begin{itemize}
+    \setlength\itemsep{7mm}
+  \item \textbf{Virtual machines:}
+    \begin{itemize}
+      \setlength\itemsep{3mm}
+    \item Contain the \alert{full operating system}, are thus very large ($\times$Gigabytes).
+    \item In \emph{binary} format (decoding a built VM's environment is extremely hard and inaccurate).
+    \end{itemize}
+  \item \textbf{Containers:} (For example Docker or Singularity)
+    \begin{itemize}
+      \setlength\itemsep{3mm}
+    \item Similar to virtual machines, but \alert{without low-level kernel} (use host's kernel).
+    \item \alert{Will fail} as soon as kernel is no longer supported\\(for example Docker currently only supports Linux kernel 3.10 and above \alert{from 2013}).
+    \item Good solutions for software engineers (that need to \emph{reproduce a bug's environment today}).
+    \item Docker is modular, needs root privileges (not available in HPCs), Dockerfiles allow incompleteness\\(especially in the common scenario of using the operating system's package manager, see next slide)
+    \item Singularity is monolithic and thus can be very large.
+    \item In \alert{binary} format (similar to VMs, especially when OS package managers are used).
+    \end{itemize}
+  \end{itemize}
+
+  \vspace{3mm}
+In summary, they only \alert{store a built} environment (they are outputs, not good for archiving).
+
+\end{frame}
+
+
+
+
+
+\begin{frame}{Existing technologies (Package managers)}
+
+  \begin{itemize}
+  \item \textbf{Operating system package managers:}
+    \begin{itemize}
+      \setlength\itemsep{2mm}
+    \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software).
+    \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible).
+    \item Older software (for example +5 years) is usually removed.
+    \end{itemize}
+  \item \textbf{Conda/Anaconda:}
+    \begin{itemize}
+      \setlength\itemsep{2mm}
+    \item Conda has build instructions for software and their dependencies.
+    \item But it doesn't go down to the C library or the lower-level components of the operating system.
+    \item It is written in Python (can't be used later when current Python is deprecated).
+    \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility).
+    \end{itemize}
+  \item \textbf{Nix, or GNU Guix:}
+    \begin{itemize}
+      \setlength\itemsep{2mm}
+    \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access.
+    \item Doesn't \emph{require} documentation of dependencies.
+    \end{itemize}
+  \item \textbf{Spack:} Similar to Nix/Guix but written in Python.
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Existing technologies (workflow tools)}
+  \begin{itemize}
+    \setlength\itemsep{4mm}
+  \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda.
+  \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI makes it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding.
+  \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them.
+  \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker.
+  \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker.
+  \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software.
+  \end{itemize}
+  \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}.
+\end{frame}
+
+
+
+
+  \begin{frame}{Summary:}
+
+    Maneage is introduced as a customizable template that will do the
+    following steps/instructions (all in simple plain text files).
+    \begin{itemize}
+      \item \alert{Automatically downloads} the necessary
+        \emph{software} and \emph{data}.
+      \item \alert{Builds} the software in a \alert{closed
+        environment}.
+      \item Runs the software on data to \alert{generate} the final
+        \alert{research results}.
+      \item A modification in one part of the analysis will only
+        result in re-doing that part, not the whole project.
+      \item Using LaTeX macros, the paper's figures, tables and numbers
+        will be \alert{automatically updated} after a change in
+        analysis, allowing the scientist to focus on the scientific
+        interpretation.
+      \item The whole project is under \alert{version control} (Git)
+        to allow easy reversion to a previous state. This
+        \alert{encourages tests/experimentation} in the analysis.
+      \item The \alert{Git commit hash} of the project source is
+        \alert{printed} in the published paper and \alert{saved on
+          output} data products, ensuring the
+        integrity/reproducibility of the result.
+      \item \colorbox{green!30!white}{These slides are available at
+        \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.}
+    \end{itemize}
+
+    \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
+                      top=1pt, bottom=1pt]
+      For a technical description of Maneage's implementation, as well
+      as a checklist to customize it, and tips on good practices,
+      please see this page:
+
+    \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}}
+    \end{tcolorbox}
+  \end{frame}
 \end{document}
author	Mohammad Akhlaghi <mohammad@akhlaghi.org>	2020-06-17 05:16:58 +0100
committer	Mohammad Akhlaghi <mohammad@akhlaghi.org>	2020-06-17 05:16:58 +0100
commit	e543088567ce06626f5b94219d63bddafb1d8d6f (patch)
tree	5b497ce257c91ebc04d86823b5e739850f9bd452
parent	11696587813b77a79ae72da9b3ea01f602ceb231 (diff)