aboutsummaryrefslogtreecommitdiff
path: root/reproduction-pipeline.tex
blob: d9ce898be750abb5ca87d98eac1dec97f5804bb7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
\documentclass[9pt]{beamer}


%% Beamer settings.
\setbeamertemplate{footline}[frame number]


%% Packages to import.
\usepackage{tcolorbox}          %For a color-box.
\usepackage{textcomp}           %For a copyright sign.


%% To simplify arXiv links: \arxivlink{ID} typesets a parenthesized,
%% footnote-sized, blue hyperlink labelled `arXiv:ID' that points to
%% https://arxiv.org/abs/ID. The argument is a short identifier, so the
%% starred \newcommand* is used: a forgotten closing brace is then
%% reported at the end of the paragraph instead of the end of the file.
\newcommand*{\arxivlink}[1]{{\footnotesize
  (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}



%% Set the title
\title{Reproducible scientific research in the era of big data}


%% Set the author
\author{Mohammad Akhlaghi\\\vspace{2mm}\footnotesize Centre de
  Recherche Astrophysique de Lyon ({\scriptsize CRAL}),\\Universit\'e de
  Lyon, France.\\
  \vspace{1.5cm}
  \includegraphics[width=3.5cm]{img/muse.png}\\
  \includegraphics[width=1.4cm]{img/cral.png}
  \includegraphics[width=1.9cm]{img/univ-lyon.png}
  \includegraphics[width=1cm]{img/cnrs.png}
  \includegraphics[width=1cm]{img/erc.png}\\
}


%% Set the date (left empty); the institutional logos are placed in \author above.
\date{}










\begin{document}

  \begin{frame}
    \titlepage
  \end{frame}


  \begin{frame}{Necessity of (exactly) reproducible research}
    \begin{itemize}
      \setlength\itemsep{0.3cm}
    \item To be considered \alert{scientific}, any result has to be
      reproducible.
    \item The tsunami of data, fast internet, and high processing
      power have made it very easy to \alert{promptly arrive at a
        result}.
    \item But these factors have also greatly increased the
      \alert{complexity} of an analysis, making it impossible to
      exactly describe all steps in a published paper.
    \item Most scientific papers thus ignore the ``details'' (as they
      interpret it).
    \item But due to the complexity, even a small deviation from the
      exact result can be due to many different parts of the
      analysis. Hence, it is \alert{critical to exactly reproduce} a
      result.
     \item The software(s) used, configuration file(s), the order of
       steps taken, along with the input data are necessary for
       reproducibility.
     \item \alert{A solution} is proposed here, which, if adopted from
       the start, can greatly \alert{simplify a scientific research
         project} and \alert{allow full/exact reproducibility} once it
       is published.
    \end{itemize}
  \end{frame}



  \begin{frame}{Values in final report/paper}
    All necessary analysis/processing \alert{input} and \alert{output}
    values are written into the final report as \LaTeX{} macros. Shown
    here is a portion of the \textsf{NoiseChisel} paper and its source
    (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).

    \vspace{1.2cm}
    \includegraphics[width=\linewidth]{img/reproducible-latex.png}
  \end{frame}

  \begin{frame}{Values in final report/paper}
    All necessary analysis/processing \alert{input} and \alert{output}
    values are written into the final report as \LaTeX{} macros. Shown
    here is a portion of the \textsf{NoiseChisel} paper and its source
    (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).

    \vspace{1.2cm}
    \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png}
  \end{frame}


  \begin{frame}{Values are the pipeline's final product}
    All the \LaTeX{} macros (processing inputs and outputs) come from
    a \alert{single file}. This file is the \alert{final product} of
    the \emph{reproduction pipeline}.

    \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-macros.png}
    \end{center}
  \end{frame}



  \begin{frame}{Values are the pipeline's final product}
    All the \LaTeX{} macros (processing inputs and outputs) come from
    a \alert{single file}. This file is the \alert{final product} of
    the \emph{reproduction pipeline}.

    \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-macros-highlighted.png}
    \end{center}
  \end{frame}


  \begin{frame}{Values written during analysis}
    Various steps of the analysis pipeline write the macro values as
    soon as they are calculated internally.

    \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro.png}
    \end{center}
  \end{frame}


  \begin{frame}{Values written during analysis}
    Various steps of the analysis pipeline write the macro values as
    soon as they are calculated internally.

    \begin{center}
    \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro-highlight.png}
    \end{center}
  \end{frame}


  \begin{frame}{Reproducible science: Pipeline is managed through a Makefile}
    \small
    \begin{columns}
      \column{5.5cm}

      The whole pipeline is managed by Makefiles (example from
      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):

      \begin{itemize}
        \setlength\itemsep{0.2cm}
      \item Unlike a script which always starts from the top, a
        Makefile \alert{starts from the end} and steps that don't
        change will be left untouched (not remade).
      \item A single \emph{rule} can \alert{manage any number of
        files}. See the examples here where \textsf{NoiseChisel} and
        \textsf{MakeCatalog} are run separately on \alert{$\sim20$
          files} (different filters/fields) with a single rule.
      \item Make can identify independent steps internally and do them
        in \alert{parallel}.
      \item Make was \alert{designed for complex problems} with
        thousands of files (all major Unix-like components), so it is
        highly evolved and efficient.
      \item Make is a very \alert{simple} and \alert{small} language,
        thus easy to learn with great and free documentation (for
        example
        \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
            Make's manual}}, usable to learn all implementations).
      \end{itemize}

      \column{5.5cm}
      \includegraphics[width=\linewidth]{img/reproducible-makefile.png}
    \end{columns}
  \end{frame}


  \begin{frame}{Reproducing the result and report/paper}
    Once software dependencies are installed, the two \alert{simple}
    and \alert{familiar} commands below are enough to exactly
    reproduce the results at any time (as in
    \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):

    \begin{itemize}
    \item[] \texttt{\$ ./configure{ }{ }{ }{ }{ }{ }\# To
      define top-level local directories.}
    \item[] \texttt{\$ make{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\# To reproduce the analysis and paper.}
    \end{itemize}

    \vspace{0.5cm} Enabling version control (e.g., \alert{Git}) will
    make it very easy to test different ideas while not harming the
    initial/base result (thus encouraging \alert{creativity} and
    brainstorming during the project).

    \vspace{0.5cm} The pipeline can also \alert{download input} data
    from online archives (databases) if not locally available (as in
    \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}
    and
    \textcolor{blue}{\href{https://gitlab.com/makhlaghi/reproduction-pipeline-template}{template}}).

    \vspace{0.5cm} After publication, \alert{readers} can
    \alert{change} the input configurations and the numbers and
    figures of the reproduced paper will change accordingly. This
    encourages creativity and brainstorming after the project as well
    as sharing of (the hard-earned) experiences with the whole
    community.
  \end{frame}



  \begin{frame}{Publication of the pipeline}

    A reproduction pipeline like this will have the following
    (\alert{plain text}) components:
    \begin{itemize}
    \item Makefiles.
    \item \LaTeX{} source files.
    \item Configuration files.
    \item Scripts/programming files (e.g., Python, Shell, AWK, C).
    \end{itemize}
    The \alert{volume} of the reproduction pipeline will thus be
    \alert{negligible} compared to a single figure in a paper
    (especially after compression).

    \vspace{1.5cm} The reproduction pipeline can be \alert{published} in
    \begin{itemize}
    \item \alert{arXiv}: uploaded with the \TeX{} source to always
      stay with the paper \\(for example
      \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The
      file containing all macros must also be uploaded so arXiv's
      server can easily build the \LaTeX{} source.
    \item \alert{Zenodo}: Along with all the input datasets (many
      Gigabytes) and software \\(for example
      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}) and given a unique DOI.
    \end{itemize}

  \end{frame}



  \begin{frame}
    A template/blank pipeline has been written and is ready to use,
    with implementation guidelines and practical tips and
    recommendations:

    \textcolor{blue}{\url{https://gitlab.com/makhlaghi/reproducible-paper}}

    \vspace{2.5cm}
    Please see this page for more:

    \textcolor{blue}{\url{http://akhlaghi.org/reproducible-science.html}}
  \end{frame}
\end{document}