% LaTeX source of slides on reproducible paper. % % Copyright (C) 2020-2022 Mohammad Akhlaghi % % This LaTeX source is free software: you can redistribute it and/or % modify it under the terms of the GNU General Public License as % published by the Free Software Foundation, either version 3 of the % License, or (at your option) any later version. % % This LaTeX source is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU % General Public License for more details. % % You should have received a copy of the GNU General Public License % along with this LaTeX source. If not, see . % Basic LaTeX settings. \documentclass[9pt,usenames,dvipsnames,aspectratio=169]{beamer} % Make it super short. \newcommand{\longformat}{} % Read the current Git commit information \include{git-commit} \include{tex/preamble} %% Beamer settings. %\setbeamertemplate{footline}[frame number] %% Packages to import. \usepackage{tcolorbox} %For a color-box. \usepackage{textcomp} %For a copyright sign. %% To simplify arXiv links \newcommand{\arxivlink}[1]{{\footnotesize (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}} %% Set the title \title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility \\\vspace{2mm} \large Maneage: Managing data lineage for long-term and archivable reproducibility \\\vspace{1mm} \footnotesize (Published in CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}})} %% Set the author \author{\\ \href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm} \footnotesize Centro de Estudios de F\'isica del Cosmos de Arag\'on ({\scriptsize CEFCA}), Teruel, Spain\vspace{8mm} } %% Set the date and insitutional logos. 
\date{\footnotesize\vspace{-9mm}\\ \textcolor{black}{Royal Observatory Coffee talk; Edinburgh}\\ \textcolor{black}{23rd of May 2023} \\ \tiny\vspace{10mm} Most recent slides available in link below (this PDF is built from \href{http://git.maneage.org/slides-intro.git}{Git commit} \gitcommit):\\ \footnotesize\textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}\\ \vspace{2mm} \raisebox{+0.2\height}{\includegraphics[width=3.5cm]{img/jcava.jpg}} \includegraphics[width=9.5mm]{img/cefca.png} \includegraphics[width=1.2cm]{img/iac.png} \raisebox{0.13\height}{\includegraphics[width=1cm]{img/eu-regional.png}} \raisebox{0.05\height}{\includegraphics[width=1cm]{img/eu-rdaeu4.png}} \raisebox{+0.1\height}{\includegraphics[width=1.4cm]{img/rda-europe.png}} \raisebox{+1\height}{\includegraphics[width=1.5cm]{img/aragon.png}} \raisebox{+0.8\height}{\includegraphics[width=1.6cm]{img/gobierno-canarias.png}}\\ \vspace{1cm} } \begin{document} \begin{frame} \titlepage \end{frame} \usebackgroundtemplate{ } %% undeclare it %% Introduction to OAJ and J-PAS \begin{frame}{Our main project: \textbf{J-PAS} with Observatorio Astrof\'isico de Javalambre (OAJ)} J-PAS will observe the northern sky in \alert{56 medium-band filters} ($\sim14$nm): \begin{center} \includegraphics[width=0.9\linewidth]{img/oaj.jpg} \end{center} \end{frame} \begin{frame}{LSST filter: 6 {\footnotesize(image from \href{https://speclite.readthedocs.io/en/latest/filters.html}{speclite docs})}:} \begin{center} \vspace{-3mm} \includegraphics[width=0.8\linewidth]{img/filters-lsst.png} \end{center} \end{frame} \begin{frame}{J-PAS filters: 56 (Bonoli+2021: \href{https://ui.adsabs.harvard.edu/abs/2021A\%26A...653A..31B}{2021A\&A...653A..31B})} \begin{center} \includegraphics[width=\linewidth]{img/filters-jpas.pdf} \end{center} \end{frame} \begin{frame}{Result: photo-\alert{spectra} of \alert{every pixel} of the non-Galactic northern sky (like an IFU)!} 
\url{http://archive.cefca.es/catalogues/minijpas-pdr201912/navigator.html} \vspace{3mm} \includegraphics[width=\linewidth]{img/minijpas-web.png} \end{frame} \newcommand{\allopacity}{1} \ifdefined\longformat \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \fi \newcommand{\paperinit}{} \ifdefined\longformat \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \fi \newcommand{\sver}{} \newcommand{\srep}{} \newcommand{\dver}{} \newcommand{\ddver}{} \newcommand{\confopt}{} \newcommand{\confenv}{} \newcommand{\containers}{} \newcommand{\db}{} \newcommand{\calib}{} \newcommand{\corr}{} \newcommand{\runord}{} \newcommand{\runopt}{} \newcommand{\humanerr}{} \newcommand{\confirmbias}{} \newcommand{\depupdate}{} \newcommand{\coauth}{} \newcommand{\varsinpaper}{} \newcommand{\recordinfo}{} \newcommand{\softcite}{} \newcommand{\prevchange}{} \newcommand{\paperfinal}{} \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} %% Don't show the happy scientist or the existing containers box. \let\paperinit\undefined \let\allopacity\undefined \let\paperfinal\undefined \let\containers\undefined \begin{frame}{Science is a tricky business} \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm] \large Data analysis [...] is a \alert{human behavior}. Researchers who hunt hard enough will turn up a result that fits statistical criteria, but their \alert{discovery} will probably be a \alert{false positive}. \vspace{3mm} \small \hfill Five ways to fix statistics (Nature, 551, Nov 2017; DOI:\textcolor{blue}{\href{https://doi.org/10.1038/d41586-017-07522-z}{10.1038/d41586-017-07522-z}}). \end{tcolorbox} \end{frame} \begin{frame}{``Reproducibility crisis'' in the sciences? 
(Baker 2016, Nature 533, 452, \textcolor{blue}{\href{https://doi.org/10.1038/533452a}{DOI:10.1038/533452a}})} \Large 1576 researchers participated in a survey by Nature, \alert{$90\%$} believed in a crisis! \vspace{7mm} \begin{center} \begin{tabular}{ |l|r| } \hline Status & $\%$ agreed \\ \hline \alert{Yes}, a significant crisis & \textcolor{red}{$52$} \\ \alert{Yes}, a slight crisis & \textcolor{red}{$38$} \\ Don't know & $7$ \\ No, there is no crisis & $3$ \\ \hline \end{tabular} \end{center} \vspace{7mm} Full PDF available at \textcolor{blue}{\url{https://www.nature.com/articles/533452a.pdf}} \end{frame} \begin{frame}{Notebooks are not long-term solutions {\small (see appendices of Akhlaghi+2021: \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})}} \begin{columns} \column{0.4\linewidth} \includegraphics[width=\linewidth]{img/dependencies-conda-initial.png} \column{0.4\linewidth} \includegraphics[width=0.9\linewidth]{img/dependencies-jupyter.png} \column{0.2\linewidth} Results from run on May 10th, 2022: \pause \vspace{7mm} Conda setup:\\\alert{39 dependencies} \pause \vspace{7mm} Jupyter (with Pip):\\\alert{61 dependencies} \pause \vspace{7mm} Web browser has more dependencies; with fluid/\alert{evolving} web technologies. \pause \vspace{7mm} They can contain \alert{binary} components. \end{columns} \end{frame} \begin{frame}{The dependency tree (Matplotlib is \emph{only one} dependency of Jupyter)} \Wider[5em]{ %\vspace{5mm} \begin{center} \includegraphics[width=0.9\linewidth]{img/matplotlib.png} \end{center} \vspace{3mm}\tiny From ``Attributing and Referencing (Research) Software: Best Practices and Outlook from Inria'' (Alliez et al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}). } \end{frame} \begin{frame}{Are containers the solution? Yes, but ... 
for the short term} \pause \begin{itemize} \setlength\itemsep{5mm} \item Containers are \alert{large} (many giga-bytes) \begin{itemize} \setlength\itemsep{3mm} \pause \item \alert{Expensive} to archive! \pause \item Example: \textcolor{blue}{\href{https://is.ieis.tue.nl/staff/pvgorp/share}{SHARE}} (enabling remote connection to Virtual machines with project environment): \begin{itemize} \setlength\itemsep{2mm} \item \alert{2nd place} in Elsevier's ``Executable paper grand challenge'' of 2011. \item SHARE's image repository was taken offline in 2019! \item Even the challenge webpage is no longer available: \textcolor{blue}{\href{http://www.executablepapers.com}{http://www.executablepapers.com}} \end{itemize} \end{itemize} \pause \item Containers are \alert{binary} (tailored to certain kernels+CPUs) \begin{itemize} \setlength\itemsep{3mm} \pause \item Only guarantee the Long Term Release kernels. \begin{itemize} \setlength\itemsep{2mm} \item Become un-readable, multi-gigabyte binary blobs in $\sim10$ years! \item Even if you store them on Zenodo! \end{itemize} \pause \item Only on common CPU architectures. \end{itemize} \pause \item Containers \alert{themselves} are \alert{hard to reproduce}. \begin{itemize} \item Example: \textcolor{blue}{\href{https://ui.adsabs.harvard.edu/abs/2020CSE....22a.102M}{2020CSE....22a.102M}} use `\texttt{FROM ubuntu:16.04}', but if run today, \textcolor{blue}{\href{https://partner-images.canonical.com/core/xenial}{images are from 2021}}. \end{itemize} \end{itemize} \end{frame} \begin{frame} \Large For \alert{longevity issues} with Jupyter, Conda, Containers, etc.\ ... \vspace{3mm} As well as a survey of \alert{deprecated}/\alert{abandoned}/\alert{lost} solutions since the \alert{1990s} ... \vspace{5mm} \hfill ... 
see the appendices in \textcolor{blue}{\href{https://arxiv.org/pdf/2006.03018.pdf}{arXiv:2006.03018}} \end{frame} \begin{frame}{Our solution: CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}} \begin{columns} \column{0.4\linewidth} \includegraphics[width=\linewidth]{img/maneage-paper.png} \column{0.6\linewidth} \includegraphics[width=\linewidth]{img/maneage-webpage.png} \begin{center} \huge{https://maneage.org} \end{center} \end{columns} \end{frame} \begin{frame}{Recognition 1: RDA adoption grant (2019) to IAC for Maneage} \begin{center} \includegraphics[width=3cm]{img/rda.png}\hspace{1cm} \includegraphics[width=1.8cm]{img/iac.png} \includegraphics[width=\linewidth]{img/h2020.jpg} \end{center} \vspace{1cm} For Maneage, the \alert{IAC} is selected as a \alert{Top European organization} funded to adopt RDA Recommendations and Outputs. \vspace{1cm} \scriptsize \begin{itemize} \item Research Data Alliance was launched by the \alert{European Commission}, NSF, National Institute of Standards and Technology, and the Australian Government’s Department of Innovation. \item RDA Outputs are the technical and social infrastructure solutions developed by RDA Working Groups or Interest Groups that enable data sharing, exchange, and interoperability. 
\end{itemize} \vspace{0.2cm} \end{frame} \begin{frame}{Recognition 2: ``News and Views'' in Nature Astronomy (\textcolor{blue}{\href{https://doi.org/10.1038/s41550-021-01402-3}{DOI:10.1038/s41550-021-01402-3}})} \begin{center} \includegraphics[width=0.8\linewidth]{img/nature-astronomy.png} \end{center} \vspace{-2mm} \footnotesize Free-to-read link: \textcolor{blue}{\url{https://rdcu.be/cmYVx}} \end{frame} \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}} \vspace{-5mm} \begin{columns}[t] \column{0.5\linewidth} \begin{center} \large\textbf{Replicability (hardware/statistical)} \rule{0.5\linewidth}{1pt} \end{center} \begin{itemize} \setlength\itemsep{0.5em} \item Involves data \alert{collection}. \item Inherently includes \alert{measurement errors}\\(can never be exactly reproduced). \item Example: Raw telescope image/spectra. \item \alert{\textbf{NOT DISCUSSED HERE.}} \end{itemize} \vspace{3.5mm} \begin{center} \vspace{-5mm} \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\ \vspace{-0.6mm} \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} \end{center} \column{0.5\linewidth} \end{columns} \end{frame} \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}} \vspace{-5mm} \begin{columns}[t] \column{0.5\linewidth} \begin{center} \large\textbf{Replicability (hardware/statistical)} \rule{0.5\linewidth}{1pt} \end{center} \begin{itemize} \setlength\itemsep{0.5em} \item Involves data \alert{collection}. \item Inherently includes \alert{measurement errors}\\(can never be exactly reproduced). \item Example: Raw telescope image/spectra. 
\item \alert{\textbf{NOT DISCUSSED HERE.}} \end{itemize} \vspace{3.5mm} \begin{center} \vspace{-5mm} \includegraphics[width=0.7\linewidth]{img/hale-prime-focus-marked.jpg}\\ \vspace{-0.6mm} \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} \end{center} \column{0.5\linewidth} \end{columns} \end{frame} \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}} \vspace{-5mm} \begin{columns}[t] \column{0.5\linewidth} \begin{center} \large\textbf{Replicability (hardware/statistical)} \rule{0.5\linewidth}{1pt} \end{center} \begin{itemize} \setlength\itemsep{0.5em} \item Involves data \alert{collection}. \item Inherently includes \alert{measurement errors}\\(can never be exactly reproduced). \item Example: Raw telescope image/spectra. \item \alert{\textbf{NOT DISCUSSED HERE.}} \end{itemize} \vspace{3.5mm} \begin{center} \vspace{-5mm} \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\ \vspace{-0.6mm} \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} \end{center} \column{0.5\linewidth} \begin{center} \large\textbf{Reproducibility (Software/Deterministic)} \rule{0.5\linewidth}{1pt} \end{center} \begin{itemize} \setlength\itemsep{1em} \item Involves data \alert{analysis}, or simulations. \item Starts \alert{after} data is collected/digitized. \item Example: $2+2=4$ (i.e., sum of datasets). 
\item \textbf{\textcolor{green!50!black}{DISCUSSED HERE.}} \end{itemize} \centering \vspace{3mm} \includegraphics[width=0.88\linewidth]{img/binary-blue.jpg}\\ \vspace{-0.6mm} \tiny \href{https://commons.wikimedia.org/wiki/File:Binary_blue.jpg}{Wikimedia Commons} \end{columns} \end{frame} \begin{frame}{Founding criteria} \begin{tcolorbox}[title=Basic/simple principle:] \centering Science is defined by its METHOD, \alert{not} its result. \end{tcolorbox} \ifdefined\longformat\pause\fi \begin{itemize} \item \textbf{Complete/self-contained:} \begin{itemize} \item \alert{Only dependency} should be \alert{POSIX} tools \textcolor{gray}{(discards Conda or Jupyter which need Python)}. \item Must \alert{not require root} permissions \textcolor{gray}{(discards tools like Docker or Nix/Guix)}. \item Should be \alert{non-interactive} or runnable in batch (user interaction is an incompleteness). \item Should be usable \alert{without internet} connection. \end{itemize} \ifdefined\longformat\pause\fi \item \textbf{Modularity:} Parts of the project should be \alert{re-usable} in other projects. \ifdefined\longformat\pause\fi \item \textbf{Plain text:} Project's source should be in \alert{plain-text} \textcolor{gray}{(binary formats need special software)} \begin{itemize} \item This includes high-level analysis. \item It is easily publishable (very low volume of $\times100$KB), archivable, and parse-able. \item \alert{Version control} (e.g., with Git) can track project's history. \end{itemize} \ifdefined\longformat\pause\fi \item \textbf{Minimal complexity:} Occam's razor: ``Never posit pluralities without necessity''. \begin{itemize} \item Avoiding the \alert{fashionable} tool of the day: tomorrow another tool will take its place! \item Easier \alert{learning curve}, also doesn't create a \alert{generational gap}. \item Is \alert{compatible} and \alert{extensible}. 
\end{itemize} \ifdefined\longformat\pause\fi \item \textbf{Verifiable inputs and outputs:} Inputs and Outputs must be \alert{automatically verified}. \ifdefined\longformat\pause\fi \item \textbf{Free and open source software:} \alert{Free software} is essential: non-free software is not configurable, not distributable, and dependent on a non-free provider (which may discontinue it in N years). \end{itemize} \end{frame} \newcommand{\focusonpackages}{} \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \let\focusonpackages\undefined \begin{frame}{Predefined/exact software tools} \small \begin{columns} \column{10cm} \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt, title=Reproducibility \& software] \footnotesize Reproducing the environment (specific \alert{software versions}, \alert{build instructions} and \alert{dependencies}) is also critically important for reproducibility. \end{tcolorbox} \begin{itemize} \setlength\itemsep{2mm} \item \emph{Containers} or \emph{Virtual Machines} are a \alert{binary black box}. \begin{itemize} \setlength\itemsep{2mm} \item e.g., with `\texttt{FROM ubuntu:16.04}' (released in April 2016), \item in a \texttt{Dockerfile}, the OS image will come from (updated monthly!): \url{https://partner-images.canonical.com/core/xenial} \end{itemize} \item Maneage \alert{installs fixed versions} of all necessary research software. \begin{itemize} \item Including their dependencies. \item All the way down to the C compiler. \end{itemize} \item Installs similar environment on \alert{GNU/Linux}, or \alert{macOS} systems. \item Works like a package manager (e.g., \alert{\texttt{apt}}, \alert{\texttt{brew}} or Conda). \begin{itemize} \item ... \alert{but (!)}, it's not a third party package manager. \item Build instructions are within same analysis project. 
\item e.g., see Conda's build of Gnuastro (it gets updated behind your back): \url{https://anaconda.org/conda-forge/gnuastro/files} \end{itemize} \item Source code of all software in Maneage is archived on \textcolor{blue}{\href{https://doi.org/10.5281/zenodo.3883409}{zenodo.3883409}}. \end{itemize} \column{5cm} \includegraphics[width=\linewidth]{img/version.png} \end{columns} \end{frame} \begin{frame}{Predefined/exact software tools} \small \begin{columns} \column{10cm} \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt, title=Reproducibility \& software] \footnotesize Reproducing the environment (specific \alert{software versions}, \alert{build instructions} and \alert{dependencies}) is also critically important for reproducibility. \end{tcolorbox} \begin{itemize} \setlength\itemsep{2mm} \item \emph{Containers} or \emph{Virtual Machines} are a \alert{binary black box}. \begin{itemize} \setlength\itemsep{2mm} \item e.g., with `\texttt{FROM ubuntu:16.04}' (released in April 2016), \item in a \texttt{Dockerfile}, the OS image will come from (updated monthly!): \url{https://partner-images.canonical.com/core/xenial} \end{itemize} \item Maneage \alert{installs fixed versions} of all necessary research software. \begin{itemize} \item Including their dependencies. \item All the way down to the C compiler. \end{itemize} \item Installs similar environment on \alert{GNU/Linux}, or \alert{macOS} systems. \item Works like a package manager (e.g., \alert{\texttt{apt}}, \alert{\texttt{brew}} or Conda). \begin{itemize} \item ... \alert{but (!)}, it's not a third party package manager. \item Build instructions are within same analysis project. \item e.g., see Conda's build of Gnuastro (it gets updated behind your back): \url{https://anaconda.org/conda-forge/gnuastro/files} \end{itemize} \item Source code of all software in Maneage is archived on \textcolor{blue}{\href{https://doi.org/10.5281/zenodo.3883409}{zenodo.3883409}}. 
\end{itemize} \column{5cm} \includegraphics[width=\linewidth]{img/version-highlighted.png} \end{columns} \end{frame} \begin{frame}{Advantages of this build system} \begin{columns} \column{11cm} \begin{itemize} \setlength\itemsep{0.7cm} \item Project runs in fixed/controlled environment: custom build of \alert{Bash}, \alert{Make}, GNU Coreutils (\alert{\texttt{ls}}, \alert{\texttt{cp}}, \alert{\texttt{mkdir}}, etc.), \alert{AWK}, or \alert{SED}, \alert{\LaTeX}, etc. \item No need for \alert{root}/administrator \alert{permissions} (on servers or super computers). \item Whole system is built \alert{automatically} on any Unix-like operating system \\(less than 2 hours). \item Dependencies of different projects will \alert{not conflict}. \item Everything in \alert{plain text} (human \& computer readable/archivable). \end{itemize} \column{4cm} \includegraphics[width=\linewidth]{img/unchained.jpg}\\ \tiny \url{https://natemowry2.wordpress.com} \end{columns} \end{frame} \ifdefined\longformat \begin{frame}{Software citation automatically generated in paper (including Astropy)} \centering \includegraphics[width=0.8\linewidth]{img/software-cite.jpg} \end{frame} \fi \begin{frame}{Software citation automatically generated in paper (including Astropy)} \centering \includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg} \end{frame}
%% Hardware/data
\newcommand{\focusonhardware}{} \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \let\focusonhardware\undefined \ifdefined\longformat \begin{frame}{Input data source and integrity is documented and checked} \small \begin{columns} \column{10cm} Stored information about each input file: \begin{itemize} \item \alert{PID} (where available). \item Download \alert{URL}. \item \alert{MD5}-sum to check integrity. \end{itemize} \vspace{0.75cm} All inputs are \alert{downloaded} from the given PID/URL when necessary\\(during the analysis). 
\vspace{0.75cm} MD5-sums are \alert{checked} to make sure the download was done properly or the file is the same (hasn't changed on the server/source). \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\ This paper needs three input files (two images, one catalog). \column{5cm} \includegraphics[width=\linewidth]{img/inputs.png} \end{columns} \end{frame} \fi \begin{frame}{Input data source and integrity is documented and checked} \small \begin{columns} \column{10cm} Stored information about each input file: \begin{itemize} \item \alert{PID} (where available). \item Download \alert{URL}. \item \alert{MD5}-sum to check integrity. \end{itemize} \vspace{0.75cm} All inputs are \alert{downloaded} from the given PID/URL when necessary\\(during the analysis). \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the download was done properly or the file is the same (hasn't changed on the server/source). \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\ This paper needs three input files (two images, one catalog). \column{5cm} \includegraphics[width=\linewidth]{img/inputs-highlighted.png} \end{columns} \end{frame} %% Analysis \newcommand{\focusonrun}{} \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \let\focusonrun\undefined \ifdefined\longformat \begin{frame}{Reproducible science: Maneage is managed through a Makefile} \small \begin{columns} \column{10cm} All steps (downloading and analysis) are managed by Makefiles\\ (example from \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): \vspace{5mm} \begin{itemize} \setlength\itemsep{0.7cm} \item Unlike a script which always starts from the top, a Makefile \alert{starts from the end} and steps that don't change will be left untouched (not remade). 
\item A single \emph{rule} can \alert{manage any number of files}. \item Make can identify independent steps internally and do them in \alert{parallel}. \item Make was \alert{designed for complex projects} with thousands of files (all major Unix-like components), so it is highly evolved and efficient. \item Make is a very \alert{simple} and \alert{small} language, thus easy to learn with great and free documentation (for example \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU Make's manual}}). \end{itemize} \column{5cm} \includegraphics[width=\linewidth]{img/reproducible-makefile.png} \end{columns} \end{frame} \begin{frame}{Reproducible science: Maneage is managed through a Makefile} \small \begin{columns} \column{10cm} All steps (downloading and analysis) are managed by Makefiles\\ (example from \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): \vspace{5mm} \begin{itemize} \setlength\itemsep{0.7cm} \item Unlike a script which always starts from the top, a Makefile \alert{starts from the end} and steps that don't change will be left untouched (not remade). \item A single \emph{rule} can \alert{manage any number of files}. \item Make can identify independent steps internally and do them in \alert{parallel}. \item Make was \alert{designed for complex projects} with thousands of files (all major Unix-like components), so it is highly evolved and efficient. \item Make is a very \alert{simple} and \alert{small} language, thus easy to learn with great and free documentation (for example \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU Make's manual}}). 
\end{itemize} \column{5cm} \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-1.png} \end{columns} \end{frame} \fi \begin{frame}{Reproducible science: Maneage is managed through a Makefile} \small \begin{columns} \column{10cm} All steps (downloading and analysis) are managed by Makefiles\\ (example from \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): \vspace{5mm} \begin{itemize} \setlength\itemsep{0.7cm} \item Unlike a script which always starts from the top, a Makefile \alert{starts from the end} and steps that don't change will be left untouched (not remade). \item A single \emph{rule} can \alert{manage any number of files}. \item Make can identify independent steps internally and do them in \alert{parallel}. \item Make was \alert{designed for complex projects} with thousands of files (all major Unix-like components), so it is highly evolved and efficient. \item Make is a very \alert{simple} and \alert{small} language, thus easy to learn with great and free documentation (for example \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU Make's manual}}). \end{itemize} \column{5cm} \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-2.png} \end{columns} \end{frame} \newcommand{\focusonpaper}{} \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \let\focusonpaper\undefined \ifdefined\longformat \begin{frame}{Values in final report/paper} All analysis \alert{results} (numbers, plots, tables) written in paper's PDF as \alert{\LaTeX{} macros}. They are thus \alert{updated automatically} on any change.\\ Shown here is a portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). 
\vspace{0.4cm} \includegraphics[width=\linewidth]{img/reproducible-latex.png} \end{frame} \fi \begin{frame}{Values in final report/paper} All analysis \alert{results} (numbers, plots, tables) written in paper's PDF as \alert{\LaTeX{} macros}. They are thus \alert{updated automatically} on any change.\\ Shown here is a portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). \vspace{0.4cm} \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png} \end{frame} \ifdefined\longformat \begin{frame}{Analysis step results/values concatenated into a single file.} All \LaTeX{} macros come from a \alert{single file}. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-macros.png} \end{center} \end{frame} \fi \begin{frame}{Analysis step results/values concatenated into a single file.} All \LaTeX{} macros come from a \alert{single file}. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-macros-highlighted.png} \end{center} \end{frame} \ifdefined\longformat \begin{frame}{Analysis results stored as \LaTeX{} macros} The analysis scripts write/update the \LaTeX{} macro values automatically. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png} \end{center} \end{frame} \fi \begin{frame}{Analysis results stored as \LaTeX{} macros} The analysis scripts write/update the \LaTeX{} macro values automatically. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro-highlight.png} \end{center} \end{frame} %% Make demo. 
\begin{frame}{Let's look at the data lineage to replicate Figure 1C (green/tool) of Menke+2020 \\(DOI:\href{https://doi.org/10.1101/2020.01.15.908111}{10.1101/2020.01.15.908111}), as done in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}} for a demo.} \begin{columns} \column{0.55\linewidth} \textcolor{blue}{ORIGINAL PLOT} The Green plot shows the fraction of papers mentioning software tools from 1997 to 2019. \column{0.45\linewidth} \includegraphics[width=\linewidth]{img/tools-per-year-orig.jpg} \end{columns} \rule{\textwidth}{1pt} \begin{columns} \column{0.4\linewidth} \textcolor{green!70!black}{OUR enhanced REPLICATION} The green line is the same as above but over their full historical range. Red histogram is the number of papers studied in each year. \column{0.6\linewidth} \vspace{1cm} \includegraphics[width=\linewidth]{img/tools-per-year.pdf} \end{columns} \end{frame} \ifdefined\longformat \makedemoslide{img/data-lineage-1.pdf} {Makefiles (\texttt{*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}} \makedemoslide{img/data-lineage-2.pdf} {The ultimate purpose of the project is to produce a paper/report (in PDF).} \makedemoslide{img/data-lineage-3.pdf} {The narrative description, typography and references are in \texttt{paper.tex} \& \texttt{references.tex}.} \makedemoslide{img/data-lineage-4.pdf} {Analysis outputs (blended into the PDF as \LaTeX{} macros) come from \texttt{project.tex}.} \makedemoslide{img/data-lineage-5.pdf} {But analysis outputs must first be \emph{verified} (with checksums) before entering the report/paper.} \makedemoslide{img/data-lineage-6.pdf} {Basic project info comes from \texttt{initialize.tex}.} \makedemoslide{img/data-lineage-7.pdf} {The paper includes some information about the plot.} \makedemoslide{img/data-lineage-8.pdf} {The final plotted data are calculated and stored in \texttt{tools-per-year.txt}.} \makedemoslide{img/data-lineage-9.pdf} {The plot's calculation 
is done on a formatted sub-set of the raw input data.} \makedemoslide{img/data-lineage-10.pdf} {The raw data that were downloaded are stored in XLSX format.} \makedemoslide{img/data-lineage-11.pdf} {The download URL \emph{and} a \alert{checksum to validate} the raw inputs, are stored in \texttt{INPUTS.conf}.} \makedemoslide{img/data-lineage-12.pdf} {We also need to report the URL in the paper...} \makedemoslide{img/data-lineage-13.pdf} {Some general info about the full dataset may also be reported.} \fi \ifdefined\longformat \makedemoslide{img/data-lineage-14.pdf} {We report the number of papers studied in a special year, desired year is stored in \texttt{.conf} file.} \else \makedemoslide{img/data-lineage-14.pdf} {All analysis steps cascade down to paper.pdf (URL and checksum of input in \texttt{INPUTS.conf}).} \fi \makedemoslide{img/data-lineage-15.pdf} {It is very easy to expand the project and add new analysis steps (this solution is scalable)} \begin{frame}{Files organized in directories by context (here are some of the files discussed before)} \centering \includegraphics[width=0.85\linewidth]{img/figure-file-architecture-1.pdf} \end{frame} \begin{frame}{Files organized in directories by context (now with other project files and symbolic links)} \centering \includegraphics[width=0.85\linewidth]{img/figure-file-architecture-2.pdf} \end{frame} \newcommand{\allopacity}{1} \begin{frame}{All questions have an answer now (in \alert{plain text}: human \& computer readable/archivable).} \include{tex/project-graph} \end{frame} \newcommand{\gitlogo}{} \begin{frame}{All questions have an answer now (in \alert{plain text}: so we can use Git to keep its history).} \include{tex/project-graph} \end{frame} \ifdefined\longformat \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\tomorrow}{1} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\abstractify}{1} \begin{frame}{New projects 
branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\projinit}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\projwork}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\tempevolve}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\mergewithtemp}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\tofuture}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\githappy}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\gitverified}{} \else \newcommand{\abstractify}{1} \newcommand{\projinit}{} \newcommand{\projwork}{} \newcommand{\tempevolve}{} \newcommand{\mergewithtemp}{} \newcommand{\tofuture}{} \newcommand{\githappy}{} \newcommand{\gitverified}{} \fi \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \ifdefined\longformat \begin{frame}{Two recent examples (publishing Git checksum in abstract)} \begin{columns} \column{0.5\linewidth} \centering \includegraphics[width=0.8\linewidth]{img/firstpage-190911230.png} \column{0.5\linewidth} \centering \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491.png} \end{columns} \end{frame} \fi \begin{frame}{Two recent examples (publishing Git checksum in abstract)} \begin{columns} \column{0.5\linewidth} \centering \includegraphics[width=0.8\linewidth]{img/firstpage-190911230-highlighted.png} \column{0.5\linewidth} \centering \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491-highlighted.png} \end{columns} \end{frame} \begin{frame}{Publication of the project} A reproducible project using Maneage will have the following (\alert{plain text}) components: \begin{itemize} \item Makefiles. \item \LaTeX{} source files. \item Configuration files for software used in analysis. 
\item Scripts/programming files (e.g., Python, Shell, AWK, C). \end{itemize} The \alert{volume} of the project's source will thus be \alert{negligible} compared to a single figure in a paper (usually $\sim100$ kilo-bytes). \vspace{6mm} The project's pipeline (customized Maneage) can be \alert{published} in \begin{itemize} \item \alert{arXiv}: uploaded with the \LaTeX{} source to always stay with the paper \\(for example \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}, \textcolor{blue}{\href{https://arxiv.org/abs/1911.01430}{arXiv:1911.01430}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}, \textcolor{blue}{\href{https://arxiv.org/abs/2007.11779}{arXiv:2007.11779}},\\ \textcolor{blue}{\href{https://arxiv.org/abs/2010.03742}{arXiv:2010.03742}}, \textcolor{blue}{\href{https://arxiv.org/abs/2112.14174}{arXiv:2112.14174}}). \item \alert{Zenodo}: along with all the input datasets (many Gigabytes) and software \\(for example \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.6533902}{zenodo.6533902}}, also see comments in arXiv links above) and given a unique DOI. 
\item \alert{Software Heritage}: to archive the full version-controlled history of the project.\\(for example {\small \textcolor{blue}{\href{https://archive.softwareheritage.org/swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f;origin=http://git.maneage.org/paper-concept.git/;visit=swh:1:snp:89af43c4b076a17d9298299f224247038af355ea;anchor=swh:1:rev:313db0b04bd3499f83d9e79fd7e92578cd367c2b}{swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f}}}) \end{itemize} \end{frame} \begin{frame}{Software Heritage IDs (SWHID); persistent identifier for source code (or any text!)} \vspace{5mm} \includegraphics[width=\linewidth]{img/SWHIDs.png} \vspace{5mm} {\hfill\small For more details, see Software Heritage FAQ (at \textcolor{blue}{\url{https://www.softwareheritage.org/faq}})} \end{frame} \begin{frame}[t]{Executing a Maneaged project (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})} \vspace{1cm} \texttt{\$ git clone https://gitlab.com/makhlaghi/maneage-paper{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\ \ifdefined\longformat\pause\fi \vspace{1.5cm} \texttt{\$ ./project configure { }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# You will specify the build directory on your system,}}\\ \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# and it will build all software (about 1.5 hours).}} \ifdefined\longformat\pause\fi \vspace{1.5cm} \texttt{\$ ./project make { }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# Does all the analysis and makes final PDF.}}\\ \end{frame} \begin{frame}{Future prospects...} \large Adoption of reproducibility by many researchers will enable the following: \vspace{1em} \begin{itemize} \setlength\itemsep{3mm} \item A repository for education/training \textcolor{gray}{(PhD students, or researchers in other fields)}. \item Easy \alert{verification}/\alert{understanding} of other research projects \textcolor{gray}{(when necessary)}. 
\item Trivially \alert{test} different steps of others' work \textcolor{gray}{(different configurations, software, etc.)}. \item Science can progress \alert{incrementally} \textcolor{gray}{(shorter papers actually building on each other!)}. \item \alert{Extract meta-data} after the publication of a dataset \textcolor{gray}{(for future ontologies or vocabularies)}. \item Applying \alert{machine learning} on reproducible research projects will allow us to solve some Big Data Challenges: \vspace{1em} \begin{itemize} \setlength\itemsep{2mm} \item \emph{Extract the relevant parameters automatically}. \item \emph{Translate the science to enormous samples}. \item \emph{Believe the results when no one will have time to reproduce}. \item \emph{Have confidence in results derived using machine learning or AI}. \end{itemize} \end{itemize} \end{frame} \begin{frame}{Summary:} Maneage (\textcolor{blue}{\url{https://maneage.org}}) is a customizable template for research or data reduction that: \begin{itemize} \item \alert{Automatically downloads} the necessary \emph{software} and \emph{data}. \item \alert{Builds} the software in a \alert{closed environment}. \item Runs the software on data to \alert{generate} the final \alert{research results}. \item Modification of part of the analysis will only result in re-doing that part, not the whole project. \item Using \LaTeX{} macros, the paper's figures, tables and numbers will be \alert{automatically updated}. \item The whole project is under \alert{version control} (Git) to allow easy reversion to a previous state. This \alert{encourages tests/experimentation} in the analysis. \item The \alert{Git commit hash} of the project source is \alert{printed} in the published paper and \alert{saved on output} data products, ensuring the integrity/reproducibility of the result. 
\item \colorbox{green!30!white}{These slides are available at \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.} \item \colorbox{green!15!white}{Longer slides are available at \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} \begin{itemize} \item YouTube recording (May 2021): \textcolor{blue}{\url{https://www.youtube.com/watch?v=XdhRUhoMqw0}} \end{itemize} \item \colorbox{purple!20!white}{\small Matrix-protocol chat room: \texttt{\#maneage-general:matrix.org}} \end{itemize} \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt] For a technical description of Maneage's implementation, as well as a checklist to customize it, and tips on good practices, please see this page: \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} \end{tcolorbox} \end{frame} \end{document}