From e543088567ce06626f5b94219d63bddafb1d8d6f Mon Sep 17 00:00:00 2001 From: Mohammad Akhlaghi Date: Wed, 17 Jun 2020 05:16:58 +0100 Subject: First draft of short slides A first draft of the summarized slides was written for the talk in the RDA Adoption week. --- img/wds.jpg | Bin 0 -> 67599 bytes slides-intro-short.tex | 916 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 913 insertions(+), 3 deletions(-) create mode 100644 img/wds.jpg diff --git a/img/wds.jpg b/img/wds.jpg new file mode 100644 index 0000000..032fb1d Binary files /dev/null and b/img/wds.jpg differ diff --git a/slides-intro-short.tex b/slides-intro-short.tex index b5e680a..88c14aa 100644 --- a/slides-intro-short.tex +++ b/slides-intro-short.tex @@ -34,11 +34,13 @@ (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}} %% Set the title -\title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility - \\ {\normalsize Introducing \emph{Maneage}: customizable framework for \emph{man}aging data lin\emph{eage}}} +\title{{\huge\textbf{BIG} Data, \textbf{BIG} responsibility} \\ + {\LARGE Towards Long-term and Archivable Reproducibility} \\ + \vspace{2mm}{\small [RDA Europe Adoption grant recipient. Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE} (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}), Comments welcome]} +} %% Set the author -\author{\vspace{1cm}\\ +\author{\vspace{8mm}\\ \href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm} \footnotesize Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain @@ -88,4 +90,912 @@ \titlepage \end{frame} + + \begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}} + Challenges (also relevant to researchers, not just repositories) + \begin{itemize} + \item \emph{Bi-directional linking}: how to \alert{link data and publications}. 
+ \item \emph{\alert{Software management}:} how to manage, preserve, publish and cite software? + \item \emph{Metrics:} \alert{how often} are data used. + \item \emph{Incentives to researchers:} how to \alert{communicate benefits} of following good practices \alert{to researchers}. + \end{itemize} + + \begin{center} + \includegraphics[width=4cm]{img/rda.png}\hspace{1cm} + \includegraphics[width=4cm]{img/wds.jpg} + \end{center} + + \pause + ``\emph{We would like to see a workflow that results in all + \textcolor{blue!30!green}{\bf scholarly objects being connected}, + linked, citable, and persistent to allow researchers to navigate + smoothly and to \alert{\bf enable reproducible research}. This + includes \alert{{\bf linkages} between documentation, code, data, and + journal articles in an integrated environment}. Furthermore, + in the ideal workflow, all of these objects need to be + \alert{\bf well documented} to enable other researchers (or + citizen scientists etc) to reuse the data for new + discoveries.}'' + \end{frame} + +\usebackgroundtemplate{ } %% undeclare it + + + \newcommand{\allopacity}{1} + \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \newcommand{\paperinit}{} + \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \newcommand{\sver}{} + \newcommand{\srep}{} + \newcommand{\dver}{} + \newcommand{\ddver}{} + \newcommand{\confopt}{} + \newcommand{\confenv}{} + \newcommand{\containers}{} + \newcommand{\db}{} + \newcommand{\calib}{} + \newcommand{\corr}{} + \newcommand{\runord}{} + \newcommand{\runopt}{} + \newcommand{\humanerr}{} + \newcommand{\confirmbias}{} + \newcommand{\depupdate}{} + \newcommand{\coauth}{} + \newcommand{\varsinpaper}{} + \newcommand{\recordinfo}{} + \newcommand{\softcite}{} + \newcommand{\prevchange}{} + \newcommand{\paperfinal}{} + \begin{frame}{General outline of a project (after data collection)} 
\include{tex/project-graph} \end{frame} + + %% Don't show the happy scientist or the existing containers box. + \let\paperinit\undefined + \let\allopacity\undefined + \let\paperfinal\undefined + \let\containers\undefined + + + + + + \begin{frame}{Science is a tricky business} + \begin{center} + \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg} + \end{center} + + \vspace{-0.3cm}\hfill + {\tiny Image from nature.com + (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five + ways to fix statistics}'', Nov 2017)}\hspace{7mm} + + \vspace{-1mm} + \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm] + \small Data analysis [...] is a \alert{human + behavior}. Researchers who hunt hard enough will turn up a + result that fits statistical criteria, but their + \alert{discovery} will probably be a \alert{false positive}. + + \hfill Five ways to fix statistics, Nature, 551, Nov 2017. + \end{tcolorbox} + \end{frame} + + + + + + \begin{frame}{Founding criteria} + \begin{tcolorbox}[title=Basic/simple principle:] + \centering Science is defined by its METHOD, \alert{not} its + result. + \end{tcolorbox} + + \pause + \begin{itemize} + \item \textbf{Complete/self-contained:} + \begin{itemize} + \item \alert{Only dependency} should be \alert{POSIX} tools \textcolor{gray}{(discards Conda or Jupyter which need Python)}. + \item Must \alert{not require root} permissions \textcolor{gray}{(discards tools like Docker or Nix/Guix)}. + \item Should be \alert{non-interactive} or runnable in batch (user interaction is an incompleteness). + \item Should be usable \alert{without internet} connection. + \end{itemize} + + \pause + \item \textbf{Modularity:} Parts of the project should be \alert{re-usable} in other projects. + \pause + \item \textbf{Plain text:} Project's source should be in \alert{plain-text} \textcolor{gray}{(binary formats need special software)} + \begin{itemize} + \item This includes high-level analysis. 
+ \item It is easily publishable (very low volume of $\times100$KB), archivable, and parse-able. + \item \alert{Version control} (e.g., with Git) can track project's history. + \end{itemize} + \pause + \item \textbf{Minimal complexity:} Occam’s razor: “Never posit pluralities without necessity”. + \begin{itemize} + \item Avoiding the \alert{fashionable} tool of the day: tomorrow another tool will take its place! + \item Easier \alert{learning curve}, also doesn't create a \alert{generational gap}. + \item Is \alert{compatible} and \alert{extensible}. + \end{itemize} + \pause + \item \textbf{Verifiable inputs and outputs:} Inputs and Outputs must be \alert{automatically verified}. + \pause + \item \textbf{Free and open source software:} \alert{Free software} is essential: non-free software is not configurable, not distributable, and dependent on non-free provider (which may discontinue it in N years). + \end{itemize} + \end{frame} + + + + \newcommand{\focusonpackages}{} + \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \let\focusonpackages\undefined + + + + + + \begin{frame}{Predefined/exact software tools} + \small + \begin{columns} + \column{10cm} + \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, + top=1pt, bottom=1pt, title=Reproducibility \& + software] + \footnotesize Reproducing the environment (specific + \alert{software versions}, \alert{build instructions} and + \alert{dependencies}) is also critically important for + reproducibility. + \end{tcolorbox} + + \vspace{2cm} + + \begin{itemize} + \setlength\itemsep{0.6cm} + \item \emph{Containers} or \emph{Virtual Machines} are a + \alert{binary black box}. + + \item Maneage \alert{installs fixed versions} of all + necessary research software and their dependencies. + + \item Installs similar environment on \alert{GNU/Linux}, or + \alert{macOS} systems. 
+ + \item Works very much like a package manager (e.g., + \alert{\texttt{apt}} or \alert{\texttt{brew}}). + \end{itemize} + + \column{5cm} + \includegraphics[width=\linewidth]{img/version.png} + \end{columns} + \end{frame} + + + + + + \begin{frame}{Predefined/exact software tools} + \small + \begin{columns} + \column{10cm} + \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, + top=1pt, bottom=1pt, title=Reproducibility \& + software] + \footnotesize Reproducing the environment (specific + \alert{software versions}, \alert{build instructions} and + \alert{dependencies}) is also critically important for + reproducibility. + \end{tcolorbox} + + \vspace{2cm} + + \begin{itemize} + \setlength\itemsep{0.6cm} + \item \emph{Containers} or \emph{Virtual Machines} are a + \alert{binary black box}. + + \item Maneage \alert{installs fixed versions} of all + necessary research software and their dependencies. + + \item Installs similar environment on \alert{GNU/Linux}, or + \alert{macOS} systems. + + \item Works very much like a package manager (e.g., + \alert{\texttt{apt}} or \alert{\texttt{brew}}). 
+ \end{itemize} + + \column{5cm} + \includegraphics[width=\linewidth]{img/version-highlighted.png} + \end{columns} + \end{frame} + + + + + + \begin{frame}{Controlled environment and build instructions} + \small + \begin{columns} + \column{5.5cm} + \includegraphics[width=0.9\linewidth]{img/env.png} + \column{5.5cm} + \includegraphics[width=0.9\linewidth]{img/build.png} + \end{columns} + \end{frame} + + \begin{frame}{Controlled environment and build instructions} + \small + \begin{columns} + \column{5.5cm} + \includegraphics[width=0.9\linewidth]{img/env-highlighted.png} + \column{5.5cm} + \includegraphics[width=0.9\linewidth]{img/build-highlighted.png} + \end{columns} + \end{frame} + + + + + + \begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies} + \Wider[5em]{ + %\vspace{5mm} + \begin{center} + \includegraphics[width=0.9\linewidth]{img/matplotlib.png} + \end{center} + + \vspace{3mm}\tiny From ``Attributing and Referencing (Research) + Software: Best Practices and Outlook from Inria'' (Alliez et + al. 2019, + \textcolor{blue}{\href{https://hal.archives-ouvertes.fr/hal-02135891}{hal-02135891}}) + } + \end{frame} + + + + + + + + \begin{frame}{Advantages of this build system} + \begin{columns} + \column{11cm} + \begin{itemize} + \setlength\itemsep{0.7cm} + \item Project runs in fixed/controlled environment: custom build + of \alert{Bash}, \alert{Make}, GNU Coreutils + (\alert{\texttt{ls}}, \alert{\texttt{cp}}, + \alert{\texttt{mkdir}}, etc.), \alert{AWK}, or \alert{SED}, + \alert{\LaTeX}, etc. + \item No need for \alert{root}/administrator \alert{permissions} + (on servers or super computers). + \item Whole system is built \alert{automatically} on any + Unix-like operating system \\(less than 2 hours). + \item Dependencies of different projects will \alert{not conflict}. + \item Everything in \alert{plain text} (human \& computer + readable/archivable). 
+ \end{itemize} + \column{4cm} + \includegraphics[width=\linewidth]{img/unchained.jpg}\\ + \tiny \url{https://natemowry2.wordpress.com} + \end{columns} + \end{frame} + + + \begin{frame}{Software citation automatically generated in paper (including Astropy)} + \centering + \includegraphics[width=0.8\linewidth]{img/software-cite.jpg} + \end{frame} + \begin{frame}{Software citation automatically generated in paper (including Astropy)} + \centering + \includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg} + \end{frame} + + %% Hardware/data + \newcommand{\focusonhardware}{} + \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \let\focusonhardware\undefined + + \begin{frame}{Input data source and integrity is documented and checked} + \small + \begin{columns} + \column{10cm} + Stored information about each input file: + \begin{itemize} + \item \alert{PID} (where available). + \item Download \alert{URL}. + \item \alert{MD5}-sum to check integrity. + \end{itemize} + + \vspace{0.75cm} All inputs are \alert{downloaded} from the given + PID/URL when necessary\\(during the analysis). + + \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the + download was done properly or the file is the same (hasn't + changed on the server/source). + + \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\ + This paper needs three input files (two images, one catalog). + + \column{5cm} + \includegraphics[width=\linewidth]{img/inputs.png} + \end{columns} + \end{frame} + + \begin{frame}{Input data source and integrity is documented and checked} + \small + \begin{columns} + \column{10cm} + Stored information about each input file: + \begin{itemize} + \item \alert{PID} (where available). + \item Download \alert{URL}. + \item \alert{MD5}-sum to check integrity. 
+ \end{itemize} + + \vspace{0.75cm} All inputs are \alert{downloaded} from the given + PID/URL when necessary\\(during the analysis). + + \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the + download was done properly or the file is the same (hasn't + changed on the server/source). + + \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\ + This paper needs three input files (two images, one catalog). + + \column{5cm} + \includegraphics[width=\linewidth]{img/inputs-highlighted.png} + \end{columns} + \end{frame} + + + + + + + + + + + + %% Analysis + \newcommand{\focusonrun}{} + \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \let\focusonrun\undefined + + + + + + \begin{frame}{Reproducible science: Maneage is managed through a Makefile} + \small + \begin{columns} + \column{10cm} + + All steps (downloading and analysis) are managed by Makefiles\\ + (example from + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): + + \vspace{5mm} + \begin{itemize} + \setlength\itemsep{0.7cm} + \item Unlike a script which always starts from the top, a + Makefile \alert{starts from the end} and steps that don't + change will be left untouched (not remade). + \item A single \emph{rule} can \alert{manage any number of + files}. + \item Make can identify independent steps internally and do them + in \alert{parallel}. + \item Make was \alert{designed for complex projects} with + thousands of files (all major Unix-like components), so it is + highly evolved and efficient. + \item Make is a very \alert{simple} and \alert{small} language, + thus easy to learn with great and free documentation (for + example + \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU + Make's manual}}). 
+ \end{itemize} + + \column{5cm} + \includegraphics[width=\linewidth]{img/reproducible-makefile.png} + \end{columns} + \end{frame} + \begin{frame}{Reproducible science: Maneage is managed through a Makefile} + \small + \begin{columns} + \column{10cm} + + All steps (downloading and analysis) are managed by Makefiles\\ + (example from + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): + + \vspace{5mm} + \begin{itemize} + \setlength\itemsep{0.7cm} + \item Unlike a script which always starts from the top, a + Makefile \alert{starts from the end} and steps that don't + change will be left untouched (not remade). + \item A single \emph{rule} can \alert{manage any number of + files}. + \item Make can identify independent steps internally and do them + in \alert{parallel}. + \item Make was \alert{designed for complex projects} with + thousands of files (all major Unix-like components), so it is + highly evolved and efficient. + \item Make is a very \alert{simple} and \alert{small} language, + thus easy to learn with great and free documentation (for + example + \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU + Make's manual}}). + \end{itemize} + + \column{5cm} + \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-1.png} + \end{columns} + \end{frame} + \begin{frame}{Reproducible science: Maneage is managed through a Makefile} + \small + \begin{columns} + \column{10cm} + + All steps (downloading and analysis) are managed by Makefiles\\ + (example from + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): + + \vspace{5mm} + \begin{itemize} + \setlength\itemsep{0.7cm} + \item Unlike a script which always starts from the top, a + Makefile \alert{starts from the end} and steps that don't + change will be left untouched (not remade). + \item A single \emph{rule} can \alert{manage any number of + files}. 
+ \item Make can identify independent steps internally and do them + in \alert{parallel}. + \item Make was \alert{designed for complex projects} with + thousands of files (all major Unix-like components), so it is + highly evolved and efficient. + \item Make is a very \alert{simple} and \alert{small} language, + thus easy to learn with great and free documentation (for + example + \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU + Make's manual}}). + \end{itemize} + + \column{5cm} + \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-2.png} + \end{columns} + \end{frame} + + + + + + + + + + + + + + + + + + + + + + + + + + \newcommand{\focusonpaper}{} + \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \let\focusonpaper\undefined + + + \begin{frame}{Values in final report/paper} + All analysis \alert{results} (numbers, plots, tables) written in + paper's PDF as \alert{\LaTeX{} macros}. They are thus + \alert{updated automatically} on any change.\\ Shown here is a + portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source + (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). + + \vspace{0.4cm} + \includegraphics[width=\linewidth]{img/reproducible-latex.png} + \end{frame} + + \begin{frame}{Values in final report/paper} + All analysis \alert{results} (numbers, plots, tables) written in + paper's PDF as \alert{\LaTeX{} macros}. They are thus + \alert{updated automatically} on any change.\\ Shown here is a + portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source + (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). + + \vspace{0.4cm} + \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png} + \end{frame} + + + + + + \begin{frame}{Analysis step results/values concatenated into a single file.} + All \LaTeX{} macros come from a \alert{single file}. 
+ \begin{center} + \includegraphics[width=0.6\linewidth]{img/reproducible-macros.png} + \end{center} + \end{frame} + \begin{frame}{Analysis step results/values concatenated into a single file.} + All \LaTeX{} macros come from a \alert{single file}. + \begin{center} + \includegraphics[width=0.6\linewidth]{img/reproducible-macros-highlighted.png} + \end{center} + \end{frame} + + + + + + + + \begin{frame}{Analysis results stored as \LaTeX{} macros} + The analysis scripts write/update the \LaTeX{} macro values + automatically. + \begin{center} + \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png} + \end{center} + \end{frame} + \begin{frame}{Analysis results stored as \LaTeX{} macros} + The analysis scripts write/update the \LaTeX{} macro values + automatically. + \begin{center} + \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro-highlight.png} + \end{center} + \end{frame} + + + %% Make demo. + \begin{frame} + \LARGE + \vspace{1cm} + \hfill Let's see how the analysis is managed in a hypothetical project... 
+ \end{frame} + \makedemoslide{img/data-lineage-1.pdf} + {Makefiles (\texttt{*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}} + \makedemoslide{img/data-lineage-2.pdf} + {The ultimate purpose of the project is to produce a paper/report (in PDF).} + \makedemoslide{img/data-lineage-3.pdf} + {The narrative description, typography and references are in \texttt{paper.tex} \& \texttt{references.tex}.} + \makedemoslide{img/data-lineage-4.pdf} + {Analysis outputs (blended into the PDF as \LaTeX{} macros) come from \texttt{project.tex}.} + \makedemoslide{img/data-lineage-5.pdf} + {But analysis outputs must first be \emph{verified} (with checksums) before entering the report/paper.} + \makedemoslide{img/data-lineage-6.pdf} + {Basic project info comes from \texttt{initialize.tex}.} + \makedemoslide{img/data-lineage-7.pdf} + {Reported values about the downloaded inputs come from \texttt{download.tex}.} + \makedemoslide{img/data-lineage-8.pdf} + {... for example the number of rows in the second input (a catalog) of the project.} + \makedemoslide{img/data-lineage-9.pdf} + {The URL to download \texttt{input2.dat}, and a checksum to validate it, are stored in \texttt{INPUTS.conf}.} + \makedemoslide{img/data-lineage-10.pdf} + {Reported values from first analysis steps stored in \texttt{analysis1.tex}.} + \makedemoslide{img/data-lineage-11.pdf} + {... 
for example the average of the numbers in \texttt{out-1b.dat}.} + \makedemoslide{img/data-lineage-12.pdf} + {But \texttt{out-1b.dat} itself depends on other files and a parameter (for example a multiple of sigma).} + \makedemoslide{img/data-lineage-13.pdf} + {\texttt{out-1a.dat} is built from a downloaded dataset.} + \makedemoslide{img/data-lineage-14.pdf} + {Download URL and checksum of \texttt{input1.dat} also stored in \texttt{INPUTS.conf}.} + \makedemoslide{img/data-lineage-15.pdf} + {Reported values from second analysis steps stored in \texttt{analysis2.tex}.} + \makedemoslide{img/data-lineage-16.pdf} + {... for example the number of selected rows in \texttt{out-2b.dat}.} + \makedemoslide{img/data-lineage-17.pdf} + {\texttt{out-2b.dat} is derived from \texttt{out-1b.dat} (for example, rejected some of \texttt{out-1b.dat}'s rows).} + \makedemoslide{img/data-lineage-18.pdf} + {Reported values from third analysis steps stored in \texttt{analysis3.tex}.} + \makedemoslide{img/data-lineage-19.pdf} + {... 
for example measurements from both \texttt{out-3a.dat} and \texttt{out-3b.dat}.} + \makedemoslide{img/data-lineage-20.pdf} + {\texttt{out-3b.dat} is generated from an analysis on \texttt{out-2a.dat}.} + \makedemoslide{img/data-lineage-21.pdf} + {But \texttt{out-2a.dat} itself is generated from \texttt{input1.dat} and an analysis which has two settings.} + \makedemoslide{img/data-lineage-22.pdf} + {\texttt{out-3a.dat} also depends on \texttt{out-1a.dat} and an analysis with needs one parameter.} + + + + + + + + + + + + + + + + + + + + + + + + + + \newcommand{\allopacity}{1} + \begin{frame}{All questions have an answer now (in + \alert{plain text}: human \& computer readable/archivable).} + \include{tex/project-graph} \end{frame} + \newcommand{\gitlogo}{} + \begin{frame}{All questions have an answer now (in + \alert{plain text}: so we can use Git to keep its history).} + \include{tex/project-graph} + \end{frame} + + + + + + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\projinit}{} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\projwork}{} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\tempevolve}{} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\mergewithtemp}{} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\tofuture}{} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\githappy}{} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\gitverified}{} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + + \begin{frame}{Two recent examples (publishing Git checksum in abstract)} + \begin{columns} + \column{0.5\linewidth} + \centering + 
\includegraphics[width=0.8\linewidth]{img/firstpage-190911230.png} + \column{0.5\linewidth} + \centering + \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491.png} + \end{columns} + \end{frame} + + \begin{frame}{Two recent examples (publishing Git checksum in abstract)} + \begin{columns} + \column{0.5\linewidth} + \centering + \includegraphics[width=0.8\linewidth]{img/firstpage-190911230-highlighted.png} + \column{0.5\linewidth} + \centering + \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491-highlighted.png} + \end{columns} + \end{frame} + + + + + + \begin{frame}{Publication of the project} + + A reproducible project using Maneage will have the following + (\alert{plain text}) components: + \begin{itemize} + \item Makefiles. + \item \LaTeX{} source files. + \item Configuration files for software used in analysis. + \item Scripts/programming files (e.g., Python, Shell, AWK, C). + \end{itemize} + The \alert{volume} of the project's source will thus be + \alert{negligible} compared to a single figure in a paper + (usually $\sim100$ kilo-bytes). + + \vspace{1cm} The project's pipeline (customized Maneage) can be + \alert{published} in + \begin{itemize} + \item \alert{arXiv}: uploaded with the \LaTeX{} source to always + stay with the paper \\(for example + \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The + file containing all macros must also be uploaded so arXiv's + server can easily build the \LaTeX{} source. + \item \alert{Zenodo}: Along with all the input datasets (many + Gigabytes) and software \\(for example + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}}) and given a unique DOI. + \end{itemize} + \end{frame} + + + + + + \begin{frame}{Project source and its execution} + \begin{tcolorbox} + Programs \textcolor{gray}{[here: Scientific projects]} must be + written for \alert{people to read}... 
+ + \hfill ...and only \emph{incidentally} for machines to + \emph{execute}. + + \vspace{2mm} + \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs + \end{tcolorbox} + \end{frame} + + + + + + \begin{frame}[t]{General outline of using this system (for example \href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230})} + \vspace{1cm} + \texttt{\$ git clone http://gitlab.com/makhlaghi/iau-symposium-355{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\ + + \pause + \vspace{1.5cm} + \texttt{\$ ./project configure { }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# You will specify the build directory on your system,}}\\ + \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# and it will build all software (about 1.5 hours).}} + + \pause + \vspace{1.5cm} + \texttt{\$ ./project make { }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# Does all the analysis and makes final PDF.}}\\ + \end{frame} + + + + + + + \begin{frame}{Future prospects...} + \large Adoption of reproducibility by many researchers will enable + the following: + + \vspace{1em} + \begin{itemize} + \setlength\itemsep{3mm} + \item A repository for education/training \textcolor{gray}{(PhD + students, or researchers in other fields)}. + \item Easy \alert{verification}/\alert{understanding} of other + research projects \textcolor{gray}{(when necessary)}. + \item Trivially \alert{test} different steps of others' work + \textcolor{gray}{(different configurations, software and etc)}. + \item Science can progress \alert{incrementally} + \textcolor{gray}{(shorter papers actually building on each + other!)}. + \item \alert{Extract meta-data} after the publication of a dataset + \textcolor{gray}{(for future ontologies or vocabularies)}. 
+ \item Applying \alert{machine learning} on reproducible research + projects will allow us to solve some Big Data Challenges: + + \vspace{1em} + \begin{itemize} + \setlength\itemsep{2mm} + \item \emph{Extract the relevant parameters automatically}. + \item \emph{Translate the science to enormous samples}. + \item \emph{Believe the results when no one will have time to + reproduce}. + \item \emph{Have confidence in results derived using machine + learning or AI}. + \end{itemize} + \end{itemize} + \end{frame} + +\begin{frame}{Existing technologies (Independent environment)} + \begin{itemize} + \setlength\itemsep{7mm} + \item \textbf{Virtual machines:} + \begin{itemize} + \setlength\itemsep{3mm} + \item Contain the \alert{full operating system}, are thus very large ($\times$Gigabytes). + \item In \emph{binary} format (decoding a built VM's environment is extremely hard and inaccurate). + \end{itemize} + \item \textbf{Containers:} (For example Docker or Singularity) + \begin{itemize} + \setlength\itemsep{3mm} + \item Similar to virtual machines, but \alert{without low-level kernel} (use host's kernel). + \item \alert{Will fail} as soon as kernel is no longer supported\\(for example Docker currently only supports Linux kernel 3.10 and above \alert{from 2013}). + \item Good solutions for software engineers (that need to \emph{reproduce a bug's environment today}). + \item Docker is modular, needs root privileges (not available in HPCs), Dockerfiles allow incompleteness\\(especially in the common scenario of using the operating system's package manager, see next slide) + \item Singularity is monolithic and thus can be very large. + \item In \alert{binary} format (similar to VMs, especially when OS package managers are used). + \end{itemize} + \end{itemize} + + \vspace{3mm} +In summary, they only \alert{store a built} environment (they are outputs, not good for archiving). 
+ +\end{frame} + + + + + +\begin{frame}{Existing technologies (Package managers)} + + \begin{itemize} + \item \textbf{Operating system package managers:} + \begin{itemize} + \setlength\itemsep{2mm} + \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software). + \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible). + \item Older software (for example +5 years) is usually removed. + \end{itemize} + \item \textbf{Conda/Anaconda:} + \begin{itemize} + \setlength\itemsep{2mm} + \item Conda has build instructions for software and their dependencies. + \item But it doesn't go down to the C library or the lower-level components of operating system. + \item It is written in Python (can't be used later when current Python is deprecated). + \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility). + \end{itemize} + \item \textbf{Nix, or GNU Guix:} + \begin{itemize} + \setlength\itemsep{2mm} + \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access. + \item Doesn't \emph{require} documentation of dependencies. + \end{itemize} + \item \textbf{Spack:} Similar to Nix/Guix but written in Python. + \end{itemize} +\end{frame} + +\begin{frame}{Existing technologies (workflow tools)} + \begin{itemize} + \setlength\itemsep{4mm} + \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda. + \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI makes it hard to automate, and has too many dependencies. 
Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding. + \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them. + \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker. + \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker. + \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software. + \end{itemize} + \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. +\end{frame} + + + + + \begin{frame}{Summary:} + + Maneage is introduced as a customizable template that will do the + following steps/instructions (all in simple plain text files). + \begin{itemize} + \item \alert{Automatically downloads} the necessary + \emph{software} and \emph{data}. + \item \alert{Builds} the software in a \alert{closed + environment}. + \item Runs the software on data to \alert{generate} the final + \alert{research results}. + \item A modification in one part of the analysis will only + result in re-doing that part, not the whole project. + \item Using LaTeX macros, paper's figures, tables and numbers + will be \alert{Automatically updated} after a change in + analysis. Allowing the scientist to focus on the scientific + interpretation. + \item The whole project is under \alert{version control} (Git) + to allow easy reversion to a previous state. This + \alert{encourages tests/experimentation} in the analysis. + \item The \alert{Git commit hash} of the project source, is + \alert{printed} in the published paper and \alert{saved on + output} data products. Ensuring the + integrity/reproducibility of the result. 
+ \item \colorbox{green!30!white}{These slides are available at + \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} + \end{itemize} + + \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, + top=1pt, bottom=1pt] + For a technical description of Maneage's implementation, as well + as a checklist to customize it, and tips on good practices, + please see this page: + + \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} + \end{tcolorbox} + \end{frame} \end{document} -- cgit v1.2.1