%% Beamer slides: "BIG data, BIG responsibility: Template/framework for
%% reproducible scientific projects/papers" (SUNDIAL Midterm meeting,
%% June 5th, 2019, Ghent).
%%
%% Layout of this file:
%%   - Preamble: packages, helper macros, title/author/date, TikZ styles.
%%   - Body: one `frame' per slide.  Two graphs (tex/project-graph.tex and
%%     tex/git-branch.tex) are shown many times in a row; before each
%%     repetition a new (empty) flag macro is defined.
%%     NOTE(review): presumably the graph files test which flags are
%%     defined to reveal more elements step by step -- those files are
%%     not part of this source, confirm there.
%%
%% The graph files are read with \input (not \include): \include issues
%% \clearpage, opens a separate .aux file and is meant for top-level
%% chapter files, not for content inside a frame.
\documentclass[9pt]{beamer}

%% Beamer settings.
%\setbeamertemplate{footline}[frame number]

%% Packages to import.
\usepackage{tcolorbox}          %For a color-box.
\usepackage{textcomp}           %For a copyright sign.

%% To simplify arXiv links: the argument is the arXiv identifier, it is
%% used both in the URL and in the printed text.
\newcommand{\arxivlink}[1]{{\footnotesize (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}

%% Set the title
\title{\LARGE \textbf{BIG} data, \textbf{BIG} responsibility:\\
  \small Template/framework for reproducible scientific projects/papers}

%% Set the author, affiliation and institutional logos.
\author{Mohammad Akhlaghi\\\vspace{2mm}\footnotesize
  Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}),\\Tenerife, Spain\\
  \vspace{0.5cm}
  \raisebox{+0.8\height}{\includegraphics[width=2cm]{img/ull.png}}{ }{ }\includegraphics[width=1.5cm]{img/iac.png}{ }{ }{ }\includegraphics[width=2cm]{img/sundial.png}\vspace{0.5cm}
}

%% Set the date and venue.
\date{\scriptsize
  \href{https://www.astro.rug.nl/~sundial/MidtermMeeting.html}{SUNDIAL Midterm meeting}, June 5th, 2019\\
  Ghent, Belgium}

%% For a wider writing width: typesets the mandatory argument in a
%% minipage that is wider than \textwidth by the optional argument
%% (default 3em), centered on the normal text block.
\newcommand\Wider[2][3em]{%
\makebox[\linewidth][c]{%
  \begin{minipage}{\dimexpr\textwidth+#1\relax}
  \raggedright#2
  \end{minipage}%
  }%
}

%% TiKZ
\usepackage{tikz}
\usetikzlibrary{graphs}
\usetikzlibrary{positioning}

%% Node styles: gray rounded box (bbox), dotted red rounded box (rbox)
%% and green box with square corners (gbox).
\tikzset{
  bbox/.style={
    rectangle, minimum width=2.5cm, rounded corners=2mm,
    very thick,draw=black!50, top color=white, bottom color=black!20
  }
}
\tikzset{
  rbox/.style={
    rectangle, dotted, minimum width=2.5cm, rounded corners=2mm,
    very thick,draw=red!50!black!50, top color=white,
    bottom color=red!50!black!20
  }
}
\tikzset{
  gbox/.style={
    rectangle, minimum width=2.5cm,
    very thick, draw=green!50!black!50, top color=white,
    bottom color=green!50!black!20
  }
}





\begin{document}

\begin{frame}
  \titlepage
\end{frame}





% \begin{frame}{Necessity of (exactly) reproducible research}
%   \begin{itemize}
%     \setlength\itemsep{0.7cm}
%   \item To be considered \alert{scientific}, any result has to be
%     reproducible.
%   \item The tsunami of data, fast internet, and high processing
%     power have made it very easy to \alert{promptly arrive at a
%       result}.
%   \item But these factors have also greatly increased the
%     \alert{complexity} of an analysis. Making it impossible to
%     exactly describe all steps in a traditional published paper.
%   \item Most scientific papers thus \alert{ignore some ``details''}
%     (as they interpret it).
%   \item But due to the complexity, even a small deviation from the
%     exact result, can be due to many different parts of the
%     analysis. Hence, it's \alert{critical to exactly reproduce} a
%     result.
%   \end{itemize}
% \end{frame}





\begin{frame}{Reproducibility crisis in the sciences/astronomy}
  \begin{tcolorbox}[title=Snakes on a Spaceship -- An Overview of Python in Heliophysics]
    \small
    ``...\alert{inadequate analysis descriptions} and loss of
    scientific data have made scientific studies \alert{difficult} or
    \alert{impossible} to replicate''. From Burrell+2018,
    \arxivlink{1901.00143}.
  \end{tcolorbox}
  \pause
  \begin{tcolorbox}[title=Perspectives on Reproducibility and Sustainability of Open-Source Scientific Software]
    ``It is our interest that NASA adopt an open-code policy because
    without it, reproducibility in computational science is
    \alert{needlessly hampered}''. From Oishi+2018,
    \arxivlink{1801.08200}.
  \end{tcolorbox}
  \pause
  \begin{tcolorbox}[title=Schroedinger's code: source code availability and link persistence\\ in astrophysics]
    ``We were \alert{unable to find source code} online ... for
    $40.4\%$ of the codes used in the research we looked at''. From
    Allen+2018, \arxivlink{1801.02094}.
  \end{tcolorbox}
\end{frame}





\begin{frame}
  \centering
  \includegraphics[width=0.6\linewidth]{img/schrodinger-code.png}
  \footnotesize Original image from
  \href{https://www.redbubble.com/people/seriesclothing/works/28520432-the-flash-ciscos-shirt-wanted-dead-and-alive-scr-dingers-cat}{\texttt{https://www.redbubble.com}}
\end{frame}





%% Step-by-step build-up of the project graph: the same graph file is
%% read once per slide, and one more flag macro is defined before each
%% repetition.
\newcommand{\nodeopacity}{1}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\paperinit}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\sver}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\srep}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\dver}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\ddver}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\confopt}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\confenv}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\db}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\calib}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\corr}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\runord}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\runopt}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\humanerr}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\depupdate}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\coauth}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\varsinpaper}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\recordinfo}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\softcite}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\prevchange}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\paperfinal}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

%% Don't show the happy scientist any more.
\let\paperfinal\undefined
\let\paperinit\undefined





\begin{frame}{Science is a tricky business}
  \includegraphics[width=\linewidth]{img/nature-cartoon.jpg}

  \vspace{-0.2cm}
  {\tiny Image from nature.com
    (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five ways to fix statistics}'',
    Nov 2017)}

  \vspace{0.2cm}
  \begin{tcolorbox}
    \small
    Data analysis [...] is a human behaviour. Researchers who hunt
    hard enough will turn up a result that fits statistical criteria,
    but their \alert{discovery} will probably be a \alert{false
      positive}.
    \hfill Five ways to fix statistics, Nature, 551, Nov 2017.
  \end{tcolorbox}
\end{frame}





\begin{frame}{Necessity of (exactly) reproducible research}
  \begin{tcolorbox}[title=Don't forget that:]
    \centering
    Science is defined by its METHOD, \alert{not} its result.
  \end{tcolorbox}

  \vspace{0.5cm}
  \begin{itemize}
    \setlength\itemsep{0.6cm}
  \item The software(s) used, configuration file(s), the order of
    steps taken, along with the input data are necessary for
    reproducibility.
  \item \alert{A solution} is proposed here, which if adopted from
    the start, can greatly \alert{simplify a scientific research
      project} and \alert{allow full/exact reproducibility} once it
    is published.
  \item In the next slides, we'll review the template from the
    highest level (final research paper) to the lowest (setting up
    the research environment).
  \end{itemize}
\end{frame}





%% Show the full graph again, this time with a node opacity of 0.3.
\renewcommand{\nodeopacity}{0.3}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}





\begin{frame}{Values in final report/paper}
  All necessary analysis/processing \alert{input} and \alert{output}
  values are written into the final report as \LaTeX{} macros. Shown
  here is a portion of the \textsf{NoiseChisel} paper and its source
  (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).

  \vspace{1.2cm}
  \includegraphics[width=\linewidth]{img/reproducible-latex.png}
\end{frame}

\begin{frame}{Values in final report/paper}
  All necessary analysis/processing \alert{input} and \alert{output}
  values are written into the final report as \LaTeX{} macros. Shown
  here is a portion of the \textsf{NoiseChisel} paper and its source
  (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).

  \vspace{1.2cm}
  \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png}
\end{frame}





\begin{frame}{Values come from a single file}
  All the \LaTeX{} macros (processing inputs and outputs) come from a
  \alert{single file}. This file is the \alert{final product} of the
  analysis steps.
  \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-macros.png}
  \end{center}
\end{frame}

\begin{frame}{Values come from a single file}
  All the \LaTeX{} macros (processing inputs and outputs) come from a
  \alert{single file}. This file is the \alert{final product} of the
  analysis steps.
  \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-macros-highlighted.png}
  \end{center}
\end{frame}





\begin{frame}{Values written during analysis}
  Various steps of the analysis write the macro values as soon as
  they are calculated internally.
  \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro.png}
  \end{center}
\end{frame}

\begin{frame}{Values written during analysis}
  Various steps of the analysis write the macro values as soon as
  they are calculated internally.
  \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro-highlight.png}
  \end{center}
\end{frame}





\begin{frame}{Reproducible science: Template is managed through a Makefile}
  \small
  \begin{columns}
    \column{5.5cm}
    All steps (downloading and analysis) are managed by Makefiles
    (example from
    \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
    \begin{itemize}
      \setlength\itemsep{0.2cm}
    \item Unlike a script which always starts from the top, a
      Makefile \alert{starts from the end} and steps that don't
      change will be left untouched (not remade).
    \item A single \emph{rule} can \alert{manage any number of
        files}. See the examples here where \textsf{NoiseChisel} and
      \textsf{MakeCatalog} are run separately on \alert{$\sim20$
        files} (different filters/fields) with a single rule.
    \item Make can identify independent steps internally and do them
      in \alert{parallel}.
    \item Make was \alert{designed for complex problems} with
      thousands of files (all major Unix-like components), so it is
      highly evolved and efficient.
    \item Make is a very \alert{simple} and \alert{small} language,
      thus easy to learn with great and free documentation (for
      example
      \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU Make's manual}},
      usable to learn all implementations).
    \end{itemize}
    \column{5.5cm}
    \includegraphics[width=\linewidth]{img/reproducible-makefile.png}
  \end{columns}
\end{frame}





\begin{frame}{Predefined/exact software tools}
  \small
  \begin{columns}
    \column{5.5cm}
    \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt, title=Reproducibility \& software]
      \footnotesize
      Reproducing the environment (specific \alert{software
        versions}, \alert{build instructions} and
      \alert{dependencies}) is also critically important for
      reproducibility.
    \end{tcolorbox}
    \begin{itemize}
      \setlength\itemsep{0.4cm}
    \item \emph{Containers} or \emph{Virtual Machines} are a
      \alert{binary black box}: just contain the environment, not how
      to set it up, or its history. They are also an overhead.
    \item This template \alert{installs fixed versions} of all
      necessary research software and their dependencies, down to the
      command-line shell, C compiler, POSIX tools and Python
      interpreter. It just avoids very low-level OS elements like the
      kernel or linker.
    \item Installs similar environment on \alert{GNU/Linux}, or
      \alert{macOS} systems.
    \item Works very much like a package manager (e.g.,
      \alert{\texttt{apt}} or \alert{\texttt{brew}}).
    \end{itemize}
    \column{5.5cm}
    \includegraphics[width=\linewidth]{img/software.png}
  \end{columns}
\end{frame}

\begin{frame}{Predefined/exact software tools}
  \small
  \begin{columns}
    \column{5.5cm}
    \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt, title=Reproducibility \& software]
      \footnotesize
      Reproducing the environment (specific \alert{software
        versions}, \alert{build instructions} and
      \alert{dependencies}) is also critically important for
      reproducibility.
    \end{tcolorbox}
    \begin{itemize}
      \setlength\itemsep{0.4cm}
    \item \emph{Containers} or \emph{Virtual Machines} are a
      \alert{binary black box}: just contain the environment, not how
      to set it up, or its history. They are also an overhead.
    \item This template \alert{installs fixed versions} of all
      necessary research software and their dependencies, down to the
      command-line shell, C compiler, POSIX tools and Python
      interpreter. It just avoids very low-level OS elements like the
      kernel or linker.
    \item Installs similar environment on \alert{GNU/Linux}, or
      \alert{macOS} systems.
    \item Works very much like a package manager (e.g.,
      \alert{\texttt{apt}} or \alert{\texttt{brew}}).
    \end{itemize}
    \column{5.5cm}
    \includegraphics[width=\linewidth]{img/software-highlighted.png}
  \end{columns}
\end{frame}





%% Library directory of the template's own build, printed in green in
%% the `ldd' listing below.
\newcommand{\redbdir}{\textcolor{green!80!black}{/TEMPLATE/BUILD/DIRECTORY/software/installed/lib}}
\begin{frame}{Dependencies are cleanly managed}
  \begin{itemize}
  \item All the software are configured and built to use the
    \alert{template's own builds}: independent of host system
    (\textcolor{green!80!black}{in green}).
  \item Template even builds a fixed GNU C Compiler (\alert{GCC}).
  \item Only extremely low-level dependencies (for example C library
    and Kernel) are not built.
    \begin{itemize}
    \item GNU C library will also be added later (\alert{in red}).
    \end{itemize}
  \end{itemize}

  \vspace{0.5cm}
  %% The name on the left of `=>' is the full shared-library name as
  %% `ldd' prints it (with the `lib' prefix), on every line.
  \tiny\texttt{
    \$ ldd .local/bin/astnoisechisel\\
    \hspace{0.5cm}libgit2.so.26 => \redbdir/libgit2.so.26 (0x00007febb5232000)\\
    \hspace{0.5cm}libtiff.so.5 => \redbdir/libtiff.so.5 (0x00007febb51b8000)\\
    \hspace{0.5cm}liblzma.so.5 => \redbdir/liblzma.so.5 (0x00007febb5190000)\\
    \hspace{0.5cm}libjpeg.so.9 => \redbdir/libjpeg.so.9 (0x00007febb5153000)\\
    \hspace{0.5cm}libz.so.1 => \redbdir/libz.so.1 (0x00007febb5136000)\\
    \hspace{0.5cm}libwcs.so.6 => \redbdir/libwcs.so.6 (0x00007febb4fcc000)\\
    \hspace{0.5cm}libcfitsio.so.8 => \redbdir/libcfitsio.so.8 (0x00007febb4caf000)\\
    \hspace{0.5cm}libcurl.so.4 => \redbdir/libcurl.so.4 (0x00007febb4c35000)\\
    \hspace{0.5cm}libssl.so.1.1 => \redbdir/libssl.so.1.1 (0x00007febb4b9b000)\\
    \hspace{0.5cm}libcrypto.so.1.1 => \redbdir/libcrypto.so.1.1 (0x00007febb48b5000)\\
    \hspace{0.5cm}libgsl.so.23 => \redbdir/libgsl.so.23 (0x00007febb4626000)\\
    \hspace{0.5cm}libgslcblas.so.0 => \redbdir/libgslcblas.so.0 (0x00007febb45e2000)\\
    \hspace{0.5cm}libgnuastro.so.8 => \redbdir/libgnuastro.so.8 (0x00007febb419e000)\\
    \hspace{0.5cm}libbz2.so.1.0 => \redbdir/libbz2.so.1.0 (0x00007febb3e20000)\\
    \hspace{0.5cm}\alert{libm.so.6} => /usr/lib/libm.so.6 (0x00007febb4025000)\\
    \hspace{0.5cm}\alert{libpthread.so.0} => /usr/lib/libpthread.so.0 (0x00007febb4004000)\\
    \hspace{0.5cm}\alert{libc.so.6} => /usr/lib/libc.so.6 (0x00007febb3e3f000)\\
    \hspace{0.5cm}librt.so.1 => /usr/lib/librt.so.1 (0x00007febb3e35000)\\
    \hspace{0.5cm}libdl.so.2 => /usr/lib/libdl.so.2 (0x00007febb3e1b000)\\
    \hspace{0.5cm}linux-vdso.so.1 (0x00007ffcf2497000)\\
    \hspace{0.5cm}/lib64/ld-linux-x86-64.so.2 => /usr/lib64/ld-linux-x86-64.so.2 (0x00007febb53c6000)
  }
\end{frame}





\begin{frame}{Advantages of this build system}
  \begin{columns}
    \column{7cm}
    \begin{itemize}
      \setlength\itemsep{1cm}
    \item No need for \alert{root}/administrator \alert{permissions}
      (on servers or super computers).
    \item Whole system is built \alert{automatically} on any
      Unix-like operating system (in less than 2 hours).
    \item Dependencies of different projects will \alert{not
        conflict}.
    \item (Almost) all dependencies are \alert{exactly} documented
      and can be reproduced.
    \end{itemize}
    \column{4cm}
    \includegraphics[width=\linewidth]{img/unchained.jpg}\\
    \tiny \url{https://natemowry2.wordpress.com}
  \end{columns}
\end{frame}





\begin{frame}{Software acknowledgment and citation automatically generated in paper}
  \includegraphics[width=\linewidth]{img/software-cite.png}
\end{frame}

\begin{frame}{Software acknowledgment and citation automatically generated in paper}
  \includegraphics[width=\linewidth]{img/software-cite-highlighted.png}
\end{frame}





% \begin{frame}{Reproducing the result and report/paper}
%   The two \alert{simple} and \alert{familiar} commands below are
%   enough to exactly reproduce the results at any time.
%
%   \begin{itemize}
%   \item[] \texttt{\$ ./configure}
%   \item[] \texttt{\$ make}
%   \end{itemize}
%
%   With \texttt{./configure}, you specify the local directories to
%   use. All necessary \alert{software} are then \alert{downloaded}
%   and installed there (independent of your OS or other projects).
%
%   \vspace{0.3cm} With \texttt{make}, input \alert{data} from online
%   archives (databases) are \alert{downloaded}, if not locally
%   available, the processing is done, and the \LaTeX{} paper is built
%   as a PDF (e.g., see
%   \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}
%   or
%   \textcolor{blue}{\small\href{https://gitlab.com/makhlaghi/reproducible-paper-output/raw/master/paper.pdf}{template's
%       output}}).
%
%   \vspace{0.3cm} Enabling version control (e.g., with \alert{Git})
%   encourages testing different ideas while not harming the
%   initial/base result (thus encouraging \alert{creativity} and
%   brainstorming during the project).
%
%   \vspace{0.3cm} After publication, \alert{readers} can
%   \alert{change} the input configurations and the numbers and
%   figures of the reproduced paper will respectively change. This
%   encourages creativity and brainstorming after the project as well
%   as sharing of (the hardly gained) experiences with the whole
%   community.
% \end{frame}





%% Back to fully opaque nodes for the "plain text" view of the graph.
\renewcommand{\nodeopacity}{1}
\begin{frame}{Everything in plain text (machine and human readable)}
  \input{tex/project-graph}
\end{frame}

\newcommand{\gitlogo}{}
\begin{frame}{Everything in plain text (machine and human readable)}
  \input{tex/project-graph}
\end{frame}





%% Step-by-step build-up of the Git branching graph, with the same
%% flag-macro scheme as the project graph above.
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\projbranch}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\mergebranch}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\tofuture}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\githappy}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}





\begin{frame}{Publication of the project}
  A reproducible project using this template will have the following
  (\alert{plain text}) components:
  \begin{itemize}
  \item Makefiles.
  \item \LaTeX{} source files.
  \item Configuration files for software used in analysis.
  \item Scripts/programming files (e.g., Python, Shell, AWK, C).
  \end{itemize}
  The \alert{volume} of the project's source will thus be
  \alert{negligible} compared to a single figure in a paper (usually
  $\sim100$ kilo-bytes).

  \vspace{1cm}
  The project's pipeline (customized template) can be
  \alert{published} in
  \begin{itemize}
  \item \alert{arXiv}: uploaded with the \TeX{} source to always stay
    with the paper \\(for example
    \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).
    The file containing all macros must also be uploaded so arXiv's
    server can easily build the \LaTeX{} source.
  \item \alert{Zenodo}: Along with all the input datasets (many
    Gigabytes) and software \\(for example
    \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}})
    and given a unique DOI.
  \end{itemize}
\end{frame}





\begin{frame}{GOOD NEWS: RDA adoption grant to IAC for this template}
  \begin{center}
    \includegraphics[width=3cm]{img/rda.png}\hspace{1cm}
    \includegraphics[width=1.8cm]{img/iac.png}
    \includegraphics[width=\linewidth]{img/h2020.png}
  \end{center}

  \vspace{1cm}
  For this template, the \alert{IAC} is selected as a \alert{Top
    European organization} funded to adopt RDA Recommendations and
  Outputs.

  \vspace{1cm}
  \scriptsize
  \begin{itemize}
  \item Research Data Alliance was launched by the \alert{European
      Commission}, NSF, National Institute of Standards and
    Technology, and the Australian Government's Department of
    Innovation.
  \item RDA Outputs are the technical and social infrastructure
    solutions developed by RDA Working Groups or Interest Groups that
    enable data sharing, exchange, and interoperability.
  \end{itemize}

  \vspace{0.2cm}
  \centering
\end{frame}





\begin{frame}{Summary:}
  A fully working template/framework is introduced that will do the
  following steps/instructions (all in simple plain text files).
  \begin{itemize}
  \item \alert{Automatically downloads} the necessary \emph{software}
    and \emph{data}.
  \item \alert{Builds} the software in a \alert{closed environment}.
  \item Runs the software on data to \alert{generate} the final
    \alert{research results}.
  \item A modification in one part of the analysis will only result
    in re-doing that part, not the whole project.
  \item Using \LaTeX{} macros, paper's figures, tables and numbers
    will be \alert{automatically updated} after a change in analysis.
    Allowing the scientist to focus on the scientific interpretation.
  \item The whole project is under \alert{version control} (Git) to
    allow easy reversion to a previous state. This \alert{encourages
      tests/experimentation} in the analysis.
  \item The \alert{Git commit hash} of the project source, is
    \alert{printed} in the published paper and \alert{saved on
      output} data products. Ensuring the integrity/reproducibility
    of the result.
  \end{itemize}
  \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt]
    For a technical description of the template's implementation, as
    well as a checklist to customize it, and tips on good practices,
    please see this page:
    \textcolor{blue}{\footnotesize\url{https://gitlab.com/makhlaghi/reproducible-paper/blob/master/README-hacking.md}}
  \end{tcolorbox}
\end{frame}

\end{document}