\documentclass[9pt,usenames,dvipsnames,aspectratio=169]{beamer}

%% Preamble of the slides: class options, packages, helper macros and
%% the title-page data. Every comment sits on its own line (or at the
%% end of a line) so that it cannot swallow the code that follows it.

%% Beamer settings.
%\setbeamertemplate{footline}[frame number]

%% Packages to import.
\usepackage{tcolorbox} %For a color-box.
\usepackage{textcomp}  %For a copyright sign.

%% To simplify arXiv links: \arxivlink{<id>} prints a small, blue,
%% parenthesized `arXiv:<id>' linking to https://arxiv.org/abs/<id>.
\newcommand{\arxivlink}[1]{{\footnotesize (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}

%% Set the title
\title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility \\
  {\normalsize (Data lineage management with template for reproducible scientific papers)}}

%% Set the author (the logos are part of the author block; the `%' at
%% the end of two lines below avoids adding an extra space between the
%% explicit `{ }' spacers and the next logo).
\author{\href{http://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{2mm}\footnotesize
  Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain\\
  \vspace{0.25cm}
  \raisebox{+0.8\height}{\includegraphics[width=1.5cm]{img/ull.png}}{ }{ }%
  \includegraphics[width=1cm]{img/iac.png}{ }{ }{ }%
  \includegraphics[width=1.5cm]{img/sundial.png}\vspace{0.5cm}\\\vspace{-0.5cm}
  \hspace{-0.9mm}\includegraphics[width=1.2cm]{img/rda.png}\vspace{0.5cm}
}

%% Set the date and institutional logos.
\date{\scriptsize Collibra (video-conf), August 6th, 2019\\
  \vspace{0.1cm}
  Slides available at \textcolor{blue}{\url{http://akhlaghi.org/pdf/reproducible-paper.pdf}}}

%% For a wider writing width: \Wider[<extra>]{<content>} typesets
%% <content> ragged-right in a minipage that is <extra> (default 3em)
%% wider than \textwidth, centered on the current line.
\newcommand\Wider[2][3em]{%
  \makebox[\linewidth][c]{%
    \begin{minipage}{\dimexpr\textwidth+#1\relax}
      \raggedright#2
    \end{minipage}%
  }%
}

%% TikZ: libraries and the three box styles (gray, dotted red, green).
\usepackage{tikz}
\usetikzlibrary{graphs}
\usetikzlibrary{positioning}
\tikzset{
  bbox/.style={
    rectangle, minimum width=2.5cm, rounded corners=2mm,
    very thick,draw=black!50, top color=white, bottom color=black!20
  }
}
\tikzset{
  rbox/.style={
    rectangle, dotted, minimum width=2.5cm, rounded corners=2mm,
    very thick,draw=red!50!black!50, top color=white,
    bottom color=red!50!black!20
  }
}
\tikzset{
  gbox/.style={
    rectangle, minimum width=2.5cm, very thick,
    draw=green!50!black!50, top color=white,
    bottom color=green!50!black!20
  }
}

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%% Directory holding the M51 figures of the next frame.
%% NOTE(review): this is an absolute, machine-specific path; the frame
%% below only builds where this directory exists.
\newcommand{\imgtdir}{/home/mohammad/documents/personal/professional/astronomy/talks/in-prep/images}

\begin{frame}{Reproducibility is critically important for astronomy}
  \small
  \begin{columns}
    \column{6cm}
    Example: Outer surface brightness of M51 in a \alert{single-exposure}
    SDSS image, using NoiseChisel.
    \vspace{2mm}
    \begin{itemize}
      \setlength\itemsep{2mm}
    \item Outer wing detected to \alert{$\mathrm{S/N}=1/4$} or $\sim28.3$
      mag/arcsec$^2$.
    \item \textcolor{blue}{\href{https://www.gnu.org/software/gnuastro/manual/html_node/Detecting-large-extended-targets.html}{Complete tutorial}}
      in manual fully describes how to derive/reproduce this result:
      \begin{itemize}
      \item \alert{Run-time} options/configuration.
      \item Steps \alert{before/after} NoiseChisel.
      \end{itemize}
    \item Deep/orange image from Watkins+2015
      (\textcolor{blue}{\href{https://arxiv.org/abs/1501.04599}{arXiv:1501.04599}}).
    \item Therefore:
      \begin{itemize}
      \item Default settings not enough.
      \item Result not just from NoiseChisel.
      \end{itemize}
    \end{itemize}
    \vspace{0.5mm}
    \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm]
      Simply reporting in your paper that
      ``\emph{\alert{we used NoiseChisel}}'' is \alert{not enough} to
      reproduce, understand, or verify your result.
\end{tcolorbox} \column{9cm} \includegraphics[width=0.49\linewidth]{\imgtdir/m51-lf.pdf} \includegraphics[width=0.49\linewidth]{\imgtdir/m51-detection.pdf} \includegraphics[width=0.49\linewidth]{\imgtdir/m51-edge.pdf} \includegraphics[width=0.49\linewidth]{\imgtdir/m51-watkins-zoom.png} \end{columns} \end{frame} \begin{frame}{Reproducibility crisis in the sciences/astronomy} \begin{tcolorbox}[title=Snakes on a Spaceship -- An Overview of Python in Heliophysics] \small ``...\alert{inadequate analysis descriptions} and loss of scientific data have made scientific studies \alert{difficult} or \alert{impossible} to replicate''. From Burrell+2018, \arxivlink{1901.00143}. \end{tcolorbox} \pause \begin{tcolorbox}[title=Perspectives on Reproducibility and Sustainability of Open-Source Scientific Software] ``It is our interest that NASA adopt an open-code policy because without it, reproducibility in computational science is \alert{needlessly hampered}''. From Oishi+2018, \arxivlink{1801.08200}. \end{tcolorbox} \pause \begin{tcolorbox}[title=Schroedinger's code: source code availability and link persistence in astrophysics] ``We were \alert{unable to find source code} online ... for $40.4\%$ of the codes used in the research we looked at''. From Allen+2018, \arxivlink{1801.02094}. \end{tcolorbox} \end{frame} \begin{frame} \centering \includegraphics[width=0.45\linewidth]{img/schrodinger-code.jpg} \footnotesize Original image from \href{https://www.redbubble.com/people/seriesclothing/works/28520432-the-flash-ciscos-shirt-wanted-dead-and-alive-scr-dingers-cat}{\texttt{https://www.redbubble.com}} \end{frame} \begin{frame}[t]{Types of reproducibility} \vspace{-5mm} \begin{columns}[t] \column{0.5\linewidth} \begin{center} \large\textbf{Hardware/Statistical reproducibility} \rule{0.5\linewidth}{1pt} \end{center} \begin{itemize} \setlength\itemsep{0.5em} \item Involves data \alert{collection}. 
\item Inherently includes \alert{measurement errors}\\(can never be exactly reproduced). \item Example: Raw telescope image/spectra. \item \alert{\textbf{NOT DISCUSSED HERE.}} \end{itemize} \vspace{3.5mm} \begin{center} \vspace{-5mm} \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\ \vspace{-0.6mm} \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} \end{center} \column{0.5\linewidth} \begin{center} \large\textbf{Software/Deterministic reproducibility} \rule{0.5\linewidth}{1pt} \end{center} \begin{itemize} \setlength\itemsep{1em} \item Involves data \alert{analysis}, or simulations. \item Starts \alert{after} data is collected/digitized. \item Example: $2+2=4$ (i.e., sum of datasets). \item \textbf{\textcolor{green!50!black}{DISCUSSED HERE.}} \end{itemize} \centering \vspace{3mm} \includegraphics[width=0.8\linewidth]{img/digital-tunnel.jpg}\\ \vspace{-0.6mm} \tiny \href{https://tsongas.com/newsletter_articles/the-new-electronic-version-of-the-advantage/digital-tunnel-wallpaper/}{https://tsongas.com} \end{columns} \end{frame} %% Step-by-step slides.
%% Step-by-step build-up of the project graph.
%%
%% Before each frame one more macro is defined and the same file
%% (tex/project-graph.tex) is read again. Presumably that file tests
%% which of these macros are defined to decide what to draw -- confirm
%% against tex/project-graph.tex.
%%
%% The file is read with \input, not \include: \include issues
%% \clearpage and opens a separate .aux file, neither of which belongs
%% inside a frame.
\newcommand{\allopacity}{1}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\paperinit}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\sver}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\begin{frame}{Different package managers have different versions of software (repology.org, 2019/08/06)}
  \begin{columns}
    \column{7cm}
    \centering GNU Astronomy Utilities (Gnuastro)\\
    \includegraphics[width=3cm]{img/distros-gnuastro.pdf}
    \column{7cm}
    \centering Astropy\\
    \includegraphics[width=2.5cm]{img/distros-astropy.pdf}
  \end{columns}
\end{frame}

\newcommand{\srep}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\dver}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\ddver}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\confopt}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\confenv}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies}
  \Wider[5em]{
    %\vspace{5mm}
    \begin{center}
      \includegraphics[width=0.9\linewidth]{img/matplotlib.png}
    \end{center}
    \vspace{3mm}\tiny From ``Attributing and Referencing (Research) Software:
    Best Practices and Outlook from Inria'' (Alliez et al.\ 2019,
    \textcolor{blue}{\href{https://hal.archives-ouvertes.fr/hal-02135891}{hal-02135891}})
  }
\end{frame}

\begin{frame}{Impact of ``Dependency hell'' on native building in various hardware (CPU architectures)}
  \begin{columns}
    \column{7cm}
    \includegraphics[width=0.9\linewidth]{img/cpu-arch-astropy.png}
    Astropy depends on Matplotlib
    \column{6cm}
    \includegraphics[width=0.9\linewidth]{img/cpu-arch-gnuastro.png}
    GNU Astronomy Utilities doesn't.
  \end{columns}
\end{frame}

\newcommand{\containers}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\db}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\calib}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\corr}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\runord}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\runopt}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\humanerr}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\depupdate}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\coauth}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\varsinpaper}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\recordinfo}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\softcite}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\prevchange}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

\newcommand{\paperfinal}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}

%% Don't show the happy scientist or the existing containers box.
\let\paperinit\undefined \let\allopacity\undefined \let\paperfinal\undefined \let\containers\undefined \begin{frame}{Science is a tricky business} \begin{center} \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg} \end{center} \vspace{-0.3cm}\hfill {\tiny Image from nature.com (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five ways to fix statistics}'', Nov 2017)}\hspace{7mm} \vspace{-1mm} \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm] \small Data analysis [...] is a \alert{human behavior}. Researchers who hunt hard enough will turn up a result that fits statistical criteria, but their \alert{discovery} will probably be a \alert{false positive}. \hfill Five ways to fix statistics, Nature, 551, Nov 2017. \end{tcolorbox} \end{frame} \begin{frame}{Necessity of (exactly) reproducible research} \begin{tcolorbox}[title=Don't forget that:] \centering Science is defined by its METHOD, \alert{not} its result. \end{tcolorbox} \vspace{0.5cm} \begin{itemize} \setlength\itemsep{0.6cm} \item The software(s) used, configuration file(s), the order of steps taken, along with the input data are necessary for reproducibility. \item \alert{A solution} is proposed here, which if adopted from the start, can greatly \alert{simplify a scientific research project} and \alert{allow full/exact reproducibility} once it is published. \item In the next slides, we'll review the template from the highest level (final research paper) to the lowest (setting up the research environment). 
\end{itemize}
\end{frame}

%% Software part of the project graph. \focusonpackages is defined only
%% while the next frame is built and is undefined again right after it.
%% Presumably tex/project-graph.tex highlights the software boxes when
%% it is defined -- confirm against that file. The file is read with
%% \input (not \include, which would issue \clearpage inside the frame).
\newcommand{\focusonpackages}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}
\let\focusonpackages\undefined

%% The next two frames have the same text; only the image differs
%% (version.png, then version-highlighted.png).
\begin{frame}{Predefined/exact software tools}
  \small
  \begin{columns}
    \column{10cm}
    \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt, title=Reproducibility \& software]
      \footnotesize Reproducing the environment (specific
      \alert{software versions}, \alert{build instructions} and
      \alert{dependencies}) is also critically important for
      reproducibility.
    \end{tcolorbox}
    \vspace{2cm}
    \begin{itemize}
      \setlength\itemsep{0.6cm}
    \item \emph{Containers} or \emph{Virtual Machines} are a
      \alert{binary black box}.
    \item This template \alert{installs fixed versions} of all necessary
      research software and their dependencies.
    \item Installs similar environment on \alert{GNU/Linux}, or
      \alert{macOS} systems.
    \item Works very much like a package manager (e.g.,
      \alert{\texttt{apt}} or \alert{\texttt{brew}}).
    \end{itemize}
    \column{5cm}
    \includegraphics[width=\linewidth]{img/version.png}
  \end{columns}
\end{frame}

\begin{frame}{Predefined/exact software tools}
  \small
  \begin{columns}
    \column{10cm}
    \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt, title=Reproducibility \& software]
      \footnotesize Reproducing the environment (specific
      \alert{software versions}, \alert{build instructions} and
      \alert{dependencies}) is also critically important for
      reproducibility.
    \end{tcolorbox}
    \vspace{2cm}
    \begin{itemize}
      \setlength\itemsep{0.6cm}
    \item \emph{Containers} or \emph{Virtual Machines} are a
      \alert{binary black box}.
    \item This template \alert{installs fixed versions} of all necessary
      research software and their dependencies.
    \item Installs similar environment on \alert{GNU/Linux}, or
      \alert{macOS} systems.
    \item Works very much like a package manager (e.g.,
      \alert{\texttt{apt}} or \alert{\texttt{brew}}).
\end{itemize} \column{5cm} \includegraphics[width=\linewidth]{img/version-highlighted.png} \end{columns} \end{frame} \begin{frame}{Controlled environment and build instructions} \small \begin{columns} \column{5.5cm} \includegraphics[width=0.9\linewidth]{img/env.png} \column{5.5cm} \includegraphics[width=0.9\linewidth]{img/build.png} \end{columns} \end{frame} \begin{frame}{Controlled environment and build instructions} \small \begin{columns} \column{5.5cm} \includegraphics[width=0.9\linewidth]{img/env-highlighted.png} \column{5.5cm} \includegraphics[width=0.9\linewidth]{img/build-highlighted.png} \end{columns} \end{frame} \newcommand{\prjdir}{\textcolor{gray}{/PROJECT}} \newcommand{\lcolor}[1]{\textcolor{green!80!black}{#1}} \begin{frame}{All high-level dependencies are under control (e.g., NoiseChisel's dependencies)} \scriptsize \begin{columns} \column{8cm} \begin{center} {\large \textbf{GNU/Linux distribution}} \end{center} \texttt{\$ ldd .local/bin/astnoisechisel}\\ \hspace{0.5cm}\texttt{\lcolor{libgnuastro.so.7} => \textcolor{gray}{\prjdir}/libgnuastro.so.7 (0x00007f6745f39000)}\\ \hspace{0.5cm}\texttt{\lcolor{libgit2.so.26} => \prjdir/libgit2.so.26 (0x00007f6745df1000)}\\ \hspace{0.5cm}\texttt{\lcolor{libtiff.so.5} => \prjdir/libtiff.so.5 (0x00007f6745d77000)}\\ \hspace{0.5cm}\texttt{\lcolor{liblzma.so.5} => \prjdir/liblzma.so.5 (0x00007f6745d4f000)}\\ \hspace{0.5cm}\texttt{\lcolor{libjpeg.so.9} => \prjdir/libjpeg.so.9 (0x00007f6745d12000)}\\ \hspace{0.5cm}\texttt{\lcolor{libwcs.so.6} => \prjdir/libwcs.so.6 (0x00007f6745ba8000)}\\ \hspace{0.5cm}\texttt{\lcolor{libcfitsio.so.8} => \prjdir/libcfitsio.so.8 (0x00007f674588b000)}\\ \hspace{0.5cm}\texttt{\lcolor{libcurl.so.4} => \prjdir/libcurl.so.4 (0x00007f6745811000)}\\ \hspace{0.5cm}\texttt{\lcolor{libssl.so.1.1} => \prjdir/libssl.so.1.1 (0x00007f6745777000)}\\ \hspace{0.5cm}\texttt{\lcolor{libcrypto.so.1.1} => \prjdir/libcrypto.so.1.1 (0x00007f6745491000)}\\ \hspace{0.5cm}\texttt{\lcolor{libz.so.1} => 
\prjdir/libz.so.1 (0x00007f6745474000)}\\ \hspace{0.5cm}\texttt{\lcolor{libgsl.so.23} => \prjdir/libgsl.so.23 (0x00007f67451e3000)}\\ \hspace{0.5cm}\texttt{\lcolor{libgslcblas.so.0} => \prjdir/libgslcblas.so.0 (0x00007f67451a1000)}\\ \hspace{0.5cm}\texttt{\textcolor{blue}{libpthread.so.0} => /usr/lib/libpthread.so.0 (0x00007f6745006000)}\\ \hspace{0.5cm}\texttt{\textcolor{blue}{libm.so.6} => /usr/lib/libm.so.6 (0x00007f6745027000)}\\ \hspace{0.5cm}\texttt{\textcolor{blue}{libc.so.6} => /usr/lib/libc.so.6 (0x00007f6744e43000)}\\ \hspace{0.5cm}\texttt{\alert{libdl.so.2} => /usr/lib/libdl.so.2 (0x00007f6744e1e000)}\\ \hspace{0.5cm}\texttt{\alert{librt.so.1} => /usr/lib/librt.so.1 (0x00007f6744e36000)}\\ \hspace{0.5cm}\texttt{\alert{linux-vdso.so.1} (0x00007fffdcbf7000)}\\ \hspace{0.5cm}\texttt{\alert{/lib64/ld-linux-x86-64.so.2} => /usr/lib64/ld-linux-x86-64.so.2} \column{7.5cm} \begin{center} {\large \textbf{macOS}} \end{center} \texttt{\$ otool -L .local/bin/astnoisechisel}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libgnuastro.7.dylib} (comp ver 8.0.0, cur ver 8.0.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libgit2.26.dylib} (comp ver 26.0.0, cur ver 0.26.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libtiff.5.dylib} (comp ver 10.0.0, cur ver 10.0.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{liblzma.5.dylib} (comp ver 8.0.0, cur ver 8.4.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libjpeg.9.dylib} (comp ver 12.0.0, cur ver 12.0.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libwcs.6.2.dylib} (comp ver 6.0.0, cur ver 6.2.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libcfitsio.8.dylib} (comp ver 8.0.0, cur ver 8.3.47)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libcurl.4.dylib} (comp ver 10.0.0, cur ver 10.0.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libssl.1.1.dylib} (comp ver 1.1.0, cur ver 1.1.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libcrypto.1.1.dylib} (comp ver 1.1.0, cur ver 1.1.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libz.1.dylib} (comp ver 1.0.0, cur ver 
1.2.11)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libgsl.23.dylib} (comp ver 25.0.0, cur ver 25.0.0)}\\ \hspace{0.5cm}\texttt{\prjdir/\lcolor{libgslcblas.0.dylib} (comp ver 1.0.0, cur ver 1.0.0)}\\ \hspace{0.5cm}\texttt{\alert{/usr/lib/libSystem.B.dylib} (comp ver 1.0.0, cur ver 1252.50.4)} \vspace{1.4cm} \end{columns} \vspace{2mm} \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm] \small \lcolor{Project libraries:} High-level libraries built for each project. \textcolor{blue}{GNU C Library:} Currently not installed, will be available on GNU/Linux systems soon. \alert{System/linker libraries}: Very low-level, we do not need to control. \end{tcolorbox} \end{frame} \begin{frame}{Advantages of this build system} \begin{columns} \column{11cm} \begin{itemize} \setlength\itemsep{0.7cm} \item Project runs in fixed/controlled environment: custom build of \alert{Bash}, \alert{Make}, GNU Coreutils (\alert{\texttt{ls}}, \alert{\texttt{cp}}, \alert{\texttt{mkdir}}, etc.), \alert{AWK}, \alert{SED}, \alert{\LaTeX}, etc. \item No need for \alert{root}/administrator \alert{permissions} (on servers or supercomputers). \item Whole system is built \alert{automatically} on any Unix-like operating system \\(less than 2 hours). \item Dependencies of different projects will \alert{not conflict}. \item Everything in \alert{plain text} (human \& computer readable/archivable). 
\end{itemize}
\column{4cm}
\includegraphics[width=\linewidth]{img/unchained.jpg}\\
\tiny \url{https://natemowry2.wordpress.com}
\end{columns}
\end{frame}

%% Software-citation screenshots: each image is shown plain, then
%% highlighted.
\begin{frame}{Software citation automatically generated in paper (including Astropy)}
  \centering
  \includegraphics[width=0.8\linewidth]{img/software-cite.jpg}
\end{frame}

\begin{frame}{Software citation automatically generated in paper (including Astropy)}
  \centering
  \includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg}
\end{frame}

\begin{frame}{Software citation automatically generated in paper (only GNU Astronomy Utilities)}
  \centering
  \includegraphics[width=0.4\linewidth]{img/software-cite-no-py.jpg}
\end{frame}

\begin{frame}{Software citation automatically generated in paper (only GNU Astronomy Utilities)}
  \centering
  \includegraphics[width=0.4\linewidth]{img/software-cite-no-py-highlighted.jpg}
\end{frame}

%% Hardware/data
%%
%% Each \focuson... macro below is defined only while its graph frame is
%% built and is undefined again right after it. Presumably
%% tex/project-graph.tex highlights the matching part of the graph when
%% the macro is defined -- confirm against that file. The file is read
%% with \input (not \include, which would issue \clearpage inside the
%% frame).
\newcommand{\focusonhardware}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}
\let\focusonhardware\undefined

\begin{frame}{Input data source and integrity is documented and checked}
  \small
  \begin{columns}
    \column{10cm}
    Stored information about each input file:
    \begin{itemize}
    \item \alert{PID} (where available).
    \item Download \alert{URL}.
    \item \alert{MD5}-sum to check integrity.
    \end{itemize}
    \vspace{1cm}
    All inputs are \alert{downloaded} from the given PID/URL when
    necessary\\(during the analysis).
    \vspace{1cm}
    MD5-sums are \alert{checked} to make sure the download was done
    properly or the file is the same (hasn't changed on the
    server/source).
    \column{5cm}
    \includegraphics[width=\linewidth]{img/inputs.png}
  \end{columns}
\end{frame}

\begin{frame}{Input data source and integrity is documented and checked}
  \small
  \begin{columns}
    \column{10cm}
    Stored information about each input file:
    \begin{itemize}
    \item \alert{PID} (where available).
    \item Download \alert{URL}.
    \item \alert{MD5}-sum to check integrity.
    \end{itemize}
    \vspace{1cm}
    All inputs are \alert{downloaded} from the given PID/URL when
    necessary\\(during the analysis).
    \vspace{1cm}
    MD5-sums are \alert{checked} to make sure the download was done
    properly or the file is the same (hasn't changed on the
    server/source).
    \column{5cm}
    \includegraphics[width=\linewidth]{img/inputs-highlighted.png}
  \end{columns}
\end{frame}

%% Analysis
\newcommand{\focusonrun}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}
\let\focusonrun\undefined

\begin{frame}{Reproducible science: Template is managed through a Makefile}
  \small
  \begin{columns}
    \column{10cm}
    All steps (downloading and analysis) are managed by Makefiles\\
    (example from
    \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
    \vspace{5mm}
    \begin{itemize}
      \setlength\itemsep{0.7cm}
    \item Unlike a script which always starts from the top, a Makefile
      \alert{starts from the end} and steps that don't change will be
      left untouched (not remade).
    \item A single \emph{rule} can \alert{manage any number of files}.
    \item Make can identify independent steps internally and do them in
      \alert{parallel}.
    \item Make was \alert{designed for complex projects} with thousands
      of files (all major Unix-like components), so it is highly evolved
      and efficient.
    \item Make is a very \alert{simple} and \alert{small} language, thus
      easy to learn with great and free documentation (for example
      \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU Make's manual}}).
    \end{itemize}
    \column{5cm}
    \includegraphics[width=\linewidth]{img/reproducible-makefile.png}
  \end{columns}
\end{frame}

\newcommand{\focusonpaper}{}
\begin{frame}{General outline of a project}
  \input{tex/project-graph}
\end{frame}
\let\focusonpaper\undefined

\begin{frame}{Values in final report/paper}
  All analysis \alert{results} (numbers, plots, tables) written in
  paper's PDF as \alert{\LaTeX{} macros}.
They are thus \alert{updated automatically} on any change.\\ Shown here is a portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). \vspace{0.4cm} \includegraphics[width=\linewidth]{img/reproducible-latex.png} \end{frame} \begin{frame}{Values in final report/paper} All analysis \alert{results} (numbers, plots, tables) written in paper's PDF as \alert{\LaTeX{} macros}. They are thus \alert{updated automatically} on any change.\\ Shown here is a portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). \vspace{0.4cm} \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png} \end{frame} \begin{frame}{Analysis step results/values concatenated into a single file.} All \LaTeX{} macros come from a \alert{single file}. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-macros.png} \end{center} \end{frame} \begin{frame}{Analysis step results/values concatenated into a single file.} All \LaTeX{} macros come from a \alert{single file}. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-macros-highlighted.png} \end{center} \end{frame} \begin{frame}{Analysis results stored as \LaTeX{} macros} The analysis scripts write/update the \LaTeX{} macro values automatically. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png} \end{center} \end{frame} \begin{frame}{Analysis results stored as \LaTeX{} macros} The analysis scripts write/update the \LaTeX{} macro values automatically. 
\begin{center}
  \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro-highlight.png}
\end{center}
\end{frame}

%% Project graph once more, then with \gitlogo also defined.
%% Presumably tex/project-graph.tex checks which of these macros are
%% defined -- confirm against that file. Files are read with \input
%% (not \include, which would issue \clearpage inside the frame).
\newcommand{\allopacity}{1}
\begin{frame}{Everything in plain text (machine and human readable)}
  \input{tex/project-graph}
\end{frame}

\newcommand{\gitlogo}{}
\begin{frame}{Everything in plain text (machine and human readable)}
  \input{tex/project-graph}
\end{frame}

%% Step-by-step build-up of the Git branching figure: one more macro is
%% defined before each frame and tex/git-branch.tex is read again.
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\projinit}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\projwork}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\tempevolve}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\mergewithtemp}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\tofuture}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\newcommand{\githappy}{}
\begin{frame}{New projects branch from template}
  \input{tex/git-branch}
\end{frame}

\begin{frame}{Project source and its execution}
  \begin{tcolorbox}
    Programs \textcolor{gray}{[here: Scientific projects]} must be
    written for \alert{people to read}... \hfill ...and only
    \emph{incidentally} for machines to \emph{execute}.
    \vspace{2mm}
    \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs
  \end{tcolorbox}
\end{frame}

\begin{frame}{Publication of the project}
  A reproducible project using this template will have the following
  (\alert{plain text}) components:
  \begin{itemize}
  \item Makefiles.
  \item \LaTeX{} source files.
  \item Configuration files for software used in analysis.
  \item Scripts/programming files (e.g., Python, Shell, AWK, C).
\end{itemize} The \alert{volume} of the project's source will thus be \alert{negligible} compared to a single figure in a paper (usually $\sim100$ kilobytes). \vspace{1cm} The project's pipeline (customized template) can be \alert{published} in \begin{itemize} \item \alert{arXiv}: uploaded with the \TeX{} source to always stay with the paper \\(for example \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The file containing all macros must also be uploaded so arXiv's server can easily build the \LaTeX{} source. \item \alert{Zenodo}: Along with all the input datasets (many gigabytes) and software \\(for example \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}) and given a unique DOI. \end{itemize} \end{frame} \begin{frame}{Future prospects...} \large Adoption of reproducibility by many researchers will enable the following: \vspace{1em} \begin{itemize} \setlength\itemsep{3mm} \item A repository for education/training \textcolor{gray}{(PhD students, or researchers in other fields)}. \item Easy \alert{verification}/\alert{understanding} of other research projects \textcolor{gray}{(when necessary)}. \item Trivially \alert{test} different steps of others' work \textcolor{gray}{(different configurations, software, etc.)}. \item Science can progress \alert{incrementally} \textcolor{gray}{(shorter papers actually building on each other!)}. \item \alert{Extract meta-data} after the publication of a dataset (for future ontologies or vocabularies). \item Applying \alert{machine learning} to reproducible research projects will allow us to solve some Big Data Challenges: \vspace{1em} \begin{itemize} \setlength\itemsep{2mm} \item \emph{Extract the relevant parameters automatically}. \item \emph{Translate the science to enormous samples}. \item \emph{Believe the results when no one will have time to reproduce}. \item \emph{Have confidence in results derived using machine learning or AI}. 
\end{itemize} \end{itemize} \end{frame} \begin{frame}{GOOD NEWS: RDA adoption grant to IAC for this template} \begin{center} \includegraphics[width=3cm]{img/rda.png}\hspace{1cm} \includegraphics[width=1.8cm]{img/iac.png} \includegraphics[width=\linewidth]{img/h2020.jpg} \end{center} \vspace{1cm} For this template, the \alert{IAC} is selected as a \alert{Top European organization} funded to adopt RDA Recommendations and Outputs. \vspace{1cm} \scriptsize \begin{itemize} \item Research Data Alliance was launched by the \alert{European Commission}, NSF, National Institute of Standards and Technology, and the Australian Government's Department of Innovation. \item RDA Outputs are the technical and social infrastructure solutions developed by RDA Working Groups or Interest Groups that enable data sharing, exchange, and interoperability. \end{itemize} \vspace{0.2cm} \centering \end{frame} \begin{frame}{Summary:} A fully working template/framework is introduced that will do the following steps/instructions (all in simple plain text files). \begin{itemize} \item \alert{Automatically downloads} the necessary \emph{software} and \emph{data}. \item \alert{Builds} the software in a \alert{closed environment}. \item Runs the software on data to \alert{generate} the final \alert{research results}. \item A modification in one part of the analysis will only result in re-doing that part, not the whole project. \item Using \LaTeX{} macros, the paper's figures, tables and numbers will be \alert{automatically updated} after a change in analysis, allowing the scientist to focus on the scientific interpretation. \item The whole project is under \alert{version control} (Git) to allow easy reversion to a previous state. This \alert{encourages tests/experimentation} in the analysis. \item The \alert{Git commit hash} of the project source is \alert{printed} in the published paper and \alert{saved on output} data products, ensuring the integrity/reproducibility of the result. 
\item \colorbox{green!30!white}{These slides are available at \textcolor{blue}{\url{http://akhlaghi.org/pdf/reproducible-paper.pdf}}.} \end{itemize} \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt] For a technical description of the template's implementation, as well as a checklist to customize it, and tips on good practices, please see this page: \textcolor{blue}{\footnotesize\url{https://gitlab.com/makhlaghi/reproducible-paper/blob/master/README-hacking.md}} \end{tcolorbox} \end{frame} \end{document}