From 1f063ee8276a5653ec59939d79f1a999bf7c529d Mon Sep 17 00:00:00 2001 From: Mohammad Akhlaghi Date: Tue, 13 Mar 2018 11:45:40 +0100 Subject: Renamed main file to reproducible-paper.tex The name of the repository and the various links to it are `reproducible-paper', so it's better that the slides also have a similar name. --- reproducible-paper.tex | 258 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 reproducible-paper.tex (limited to 'reproducible-paper.tex') diff --git a/reproducible-paper.tex b/reproducible-paper.tex new file mode 100644 index 0000000..d9ce898 --- /dev/null +++ b/reproducible-paper.tex @@ -0,0 +1,258 @@ +\documentclass[9pt]{beamer} + + +%% Beamer settings. +\setbeamertemplate{footline}[frame number] + + +%% Packages to import. +\usepackage{tcolorbox} %For a color-box. +\usepackage{textcomp} %For a copyright sign. + + +%% To simplify arXiv links +\newcommand{\arxivlink}[1]{{\footnotesize + (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}} + + + +%% Set the title +\title{Reproducible scientific research in the era of big data} + + +%% Set the author +\author{Mohammad Akhlaghi\\\vspace{2mm}\footnotesize Centre de + Recherche Astrophysique de Lyon ({\scriptsize CRAL}),\\Universit\'e de + Lyon, France.\\ + \vspace{1.5cm} + \includegraphics[width=3.5cm]{img/muse.png}\\ + \includegraphics[width=1.4cm]{img/cral.png} + \includegraphics[width=1.9cm]{img/univ-lyon.png} + \includegraphics[width=1cm]{img/cnrs.png} + \includegraphics[width=1cm]{img/erc.png}\\ +} + + +%% Set the date and institutional logos. +\date{} + + + + + + + + + + +\begin{document} + + \begin{frame} + \titlepage + \end{frame} + + + \begin{frame}{Necessity of (exactly) reproducible research} + \begin{itemize} + \setlength\itemsep{0.3cm} + \item To be considered \alert{scientific}, any result has to be + reproducible. 
+ \item The tsunami of data, fast internet, and high processing + power have made it very easy to \alert{promptly arrive at a + result}. + \item But these factors have also greatly increased the + \alert{complexity} of an analysis, making it impossible to + exactly describe all steps in a published paper. + \item Most scientific papers thus ignore the ``details'' (as they + interpret it). + \item But due to the complexity, even a small deviation from the + exact result can be due to many different parts of the + analysis. Hence, it's \alert{critical to exactly reproduce} a + result. + \item The software(s) used, configuration file(s), the order of + steps taken, along with the input data are necessary for + reproducibility. + \item \alert{A solution} is proposed here, which if adopted from + the start, can greatly \alert{simplify a scientific research + project} and \alert{allow full/exact reproducibility} once it + is published. + \end{itemize} + \end{frame} + + + + \begin{frame}{Values in final report/paper} + All necessary analysis/processing \alert{input} and \alert{output} + values are written into the final report as \LaTeX{} macros. Shown + here is a portion of the \textsf{NoiseChisel} paper and its source + (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). + + \vspace{1.2cm} + \includegraphics[width=\linewidth]{img/reproducible-latex.png} + \end{frame} + + \begin{frame}{Values in final report/paper} + All necessary analysis/processing \alert{input} and \alert{output} + values are written into the final report as \LaTeX{} macros. Shown + here is a portion of the \textsf{NoiseChisel} paper and its source + (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). 
+ + \vspace{1.2cm} + \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png} + \end{frame} + + + \begin{frame}{Values are the pipeline's final product} + All the \LaTeX{} macros (processing inputs and outputs) come from + a \alert{single file}. This file is the \alert{final product} of + the \emph{reproduction pipeline}. + + \begin{center} + \includegraphics[width=0.8\linewidth]{img/reproducible-macros.png} + \end{center} + \end{frame} + + + + \begin{frame}{Values are the pipeline's final product} + All the \LaTeX{} macros (processing inputs and outputs) come from + a \alert{single file}. This file is the \alert{final product} of + the \emph{reproduction pipeline}. + + \begin{center} + \includegraphics[width=0.8\linewidth]{img/reproducible-macros-highlighted.png} + \end{center} + \end{frame} + + + \begin{frame}{Values written during analysis} + Various steps of the analysis pipeline write the macro values as + soon as they are calculated internally. + + \begin{center} + \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro.png} + \end{center} + \end{frame} + + + \begin{frame}{Values written during analysis} + Various steps of the analysis pipeline write the macro values as + soon as they are calculated internally. + + \begin{center} + \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro-highlight.png} + \end{center} + \end{frame} + + + \begin{frame}{Reproducible science: Pipeline is managed through a Makefile} + \small + \begin{columns} + \column{5.5cm} + + The whole pipeline is managed by Makefiles (example from + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): + + \begin{itemize} + \setlength\itemsep{0.2cm} + \item Unlike a script which always starts from the top, a + Makefile \alert{starts from the end} and steps that don't + change will be left untouched (not remade). + \item A single \emph{rule} can \alert{manage any number of + files}. 
See the examples here where \textsf{NoiseChisel} and + \textsf{MakeCatalog} are run separately on \alert{$\sim20$ + files} (different filters/fields) with a single rule. + \item Make can identify independent steps internally and do them + in \alert{parallel}. + \item Make was \alert{designed for complex problems} with + thousands of files (all major Unix-like components), so it is + highly evolved and efficient. + \item Make is a very \alert{simple} and \alert{small} language, + thus easy to learn with great and free documentation (for + example + \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU + Make's manual}}, usable to learn all implementations). + \end{itemize} + + \column{5.5cm} + \includegraphics[width=\linewidth]{img/reproducible-makefile.png} + \end{columns} + \end{frame} + + + \begin{frame}{Reproducing the result and report/paper} + Once software dependencies are installed, the two \alert{simple} + and \alert{familiar} commands below are enough to exactly + reproduce the results at any time (as in + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}): + + \begin{itemize} + \item[] \texttt{\$ ./configure{ }{ }{ }{ }{ }{ }\# To + define top-level local directories.} + \item[] \texttt{\$ make{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\# To reproduce the analysis and paper.} + \end{itemize} + + \vspace{0.5cm} Enabling version control (e.g., \alert{Git}) will + make it very easy to test different ideas while not harming the + initial/base result (thus encouraging \alert{creativity} and + brainstorming during the project). + + \vspace{0.5cm} The pipeline can also \alert{download input} data + from online archives (databases) if not locally available (as in + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}} + and + \textcolor{blue}{\href{https://gitlab.com/makhlaghi/reproduction-pipeline-template}{template}}). 
+ + \vspace{0.5cm} After publication, \alert{readers} can + \alert{change} the input configurations and the numbers and + figures of the reproduced paper will change accordingly. This + encourages creativity and brainstorming after the project as well + as sharing of (hard-earned) experience with the whole + community. + \end{frame} + + + + \begin{frame}{Publication of the pipeline} + + A reproduction pipeline like this will have the following + (\alert{plain text}) components: + \begin{itemize} + \item Makefiles. + \item \LaTeX{} source files. + \item Configuration files. + \item Scripts/programming files (e.g., Python, Shell, AWK, C). + \end{itemize} + The \alert{volume} of the reproduction pipeline will thus be + \alert{negligible} compared to a single figure in a paper + (especially after compression). + + \vspace{1.5cm} The reproduction pipeline can be \alert{published} in + \begin{itemize} + \item \alert{arXiv}: uploaded with the \TeX{} source to always + stay with the paper \\(for example + \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The + file containing all macros must also be uploaded so arXiv's + server can easily build the \LaTeX{} source. + \item \alert{Zenodo}: along with all the input datasets (many + gigabytes) and software \\(for example + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}) and given a unique DOI. + \end{itemize} + + \end{frame} + + + + \begin{frame} + A template/blank pipeline has been written and is ready to use, + with implementation guidelines and practical tips and + recommendations: + + \textcolor{blue}{\url{https://gitlab.com/makhlaghi/reproducible-paper}} + + \vspace{2.5cm} + Please see this page for more: + + \textcolor{blue}{\url{http://akhlaghi.org/reproducible-science.html}} + \end{frame} +\end{document} -- cgit v1.2.1