aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMohammad Akhlaghi <mohammad@akhlaghi.org>2018-03-13 11:31:26 +0100
committerMohammad Akhlaghi <mohammad@akhlaghi.org>2018-03-13 11:38:32 +0100
commit10f9dfe4801268b68d8526ba4507abb324cf349c (patch)
tree99d4a2be15cc42737fc8a1dac9f6c21ddcb7ce42
Started version control
The first version of these slides was presented in a talk I gave at CRAL on the 9th of February (placed at the end of slides on the science topic I was talking about). Later, I separated them into an independent set of slides to help facilitate the discussions I was having and also to upload on my webpage. As the concept is evolving, I found myself having to make changes to the slides, so to keep track of the slides and history of the changes, I thought of making this repository.
-rw-r--r--.gitignore8
-rw-r--r--img/cnrs.pngbin0 -> 16388 bytes
-rw-r--r--img/cral.pngbin0 -> 45217 bytes
-rw-r--r--img/erc.pngbin0 -> 48016 bytes
-rw-r--r--img/muse.pngbin0 -> 63621 bytes
-rw-r--r--img/reproducible-latex-highlighted.pngbin0 -> 355501 bytes
-rw-r--r--img/reproducible-latex.pngbin0 -> 354742 bytes
-rw-r--r--img/reproducible-macros-highlighted.pngbin0 -> 302253 bytes
-rw-r--r--img/reproducible-macros.pngbin0 -> 301514 bytes
-rw-r--r--img/reproducible-makefile.pngbin0 -> 380119 bytes
-rw-r--r--img/reproducible-write-macro-highlight.pngbin0 -> 184251 bytes
-rw-r--r--img/reproducible-write-macro.pngbin0 -> 184317 bytes
-rw-r--r--img/univ-lyon.pngbin0 -> 16867 bytes
-rw-r--r--reproduction-pipeline.tex258
14 files changed, 266 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5af27e9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*~
+*.aux
+*.log
+*.nav
+*.out
+*.pdf
+*.snm
+*.toc \ No newline at end of file
diff --git a/img/cnrs.png b/img/cnrs.png
new file mode 100644
index 0000000..c93f342
--- /dev/null
+++ b/img/cnrs.png
Binary files differ
diff --git a/img/cral.png b/img/cral.png
new file mode 100644
index 0000000..2780f8a
--- /dev/null
+++ b/img/cral.png
Binary files differ
diff --git a/img/erc.png b/img/erc.png
new file mode 100644
index 0000000..8221779
--- /dev/null
+++ b/img/erc.png
Binary files differ
diff --git a/img/muse.png b/img/muse.png
new file mode 100644
index 0000000..80f83b4
--- /dev/null
+++ b/img/muse.png
Binary files differ
diff --git a/img/reproducible-latex-highlighted.png b/img/reproducible-latex-highlighted.png
new file mode 100644
index 0000000..e3c8bb2
--- /dev/null
+++ b/img/reproducible-latex-highlighted.png
Binary files differ
diff --git a/img/reproducible-latex.png b/img/reproducible-latex.png
new file mode 100644
index 0000000..61411c0
--- /dev/null
+++ b/img/reproducible-latex.png
Binary files differ
diff --git a/img/reproducible-macros-highlighted.png b/img/reproducible-macros-highlighted.png
new file mode 100644
index 0000000..4f5a346
--- /dev/null
+++ b/img/reproducible-macros-highlighted.png
Binary files differ
diff --git a/img/reproducible-macros.png b/img/reproducible-macros.png
new file mode 100644
index 0000000..7edc4bb
--- /dev/null
+++ b/img/reproducible-macros.png
Binary files differ
diff --git a/img/reproducible-makefile.png b/img/reproducible-makefile.png
new file mode 100644
index 0000000..9217644
--- /dev/null
+++ b/img/reproducible-makefile.png
Binary files differ
diff --git a/img/reproducible-write-macro-highlight.png b/img/reproducible-write-macro-highlight.png
new file mode 100644
index 0000000..a7a8ff7
--- /dev/null
+++ b/img/reproducible-write-macro-highlight.png
Binary files differ
diff --git a/img/reproducible-write-macro.png b/img/reproducible-write-macro.png
new file mode 100644
index 0000000..3add994
--- /dev/null
+++ b/img/reproducible-write-macro.png
Binary files differ
diff --git a/img/univ-lyon.png b/img/univ-lyon.png
new file mode 100644
index 0000000..cbbee20
--- /dev/null
+++ b/img/univ-lyon.png
Binary files differ
diff --git a/reproduction-pipeline.tex b/reproduction-pipeline.tex
new file mode 100644
index 0000000..d9ce898
--- /dev/null
+++ b/reproduction-pipeline.tex
@@ -0,0 +1,258 @@
+\documentclass[9pt]{beamer}
+
+
+%% Beamer settings.
+\setbeamertemplate{footline}[frame number]
+
+
+%% Packages to import.
+\usepackage{tcolorbox} %For a color-box.
+\usepackage{textcomp} %For a copyright sign.
+
+
+%% \arxivlink{ID}: parenthesized, footnote-sized, blue `arXiv:ID' link to https://arxiv.org/abs/ID (starred: ID is never multi-paragraph).
+\newcommand*{\arxivlink}[1]{{\footnotesize
+ (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}
+
+
+
+%% Set the title
+\title{Reproducible scientific research in the era of big data}
+
+
+%% Set the author
+\author{Mohammad Akhlaghi\\\vspace{2mm}\footnotesize Centre de
+ Recherche Astrophysique de Lyon ({\scriptsize CRAL}),\\Universit\'e de
+ Lyon, France.\\
+ \vspace{1.5cm}
+ \includegraphics[width=3.5cm]{img/muse.png}\\
+ \includegraphics[width=1.4cm]{img/cral.png}
+ \includegraphics[width=1.9cm]{img/univ-lyon.png}
+ \includegraphics[width=1cm]{img/cnrs.png}
+ \includegraphics[width=1cm]{img/erc.png}\\
+}
+
+
+%% Set the date (left empty; the institutional logos are placed in \author above).
+\date{}
+
+
+
+
+
+
+
+
+
+
+\begin{document}
+
+ \begin{frame}
+ \titlepage
+ \end{frame}
+
+
+ \begin{frame}{Necessity of (exactly) reproducible research}
+ \begin{itemize}
+ \setlength\itemsep{0.3cm}
+ \item To be considered \alert{scientific}, any result has to be
+ reproducible.
+ \item The tsunami of data, fast internet, and high processing
+ power have made it very easy to \alert{promptly arrive at a
+ result}.
+ \item But these factors have also greatly increased the
+ \alert{complexity} of an analysis, making it impossible to
+ exactly describe all steps in a published paper.
+ \item Most scientific papers thus ignore the ``details'' (as they
+ interpret it).
+ \item But due to the complexity, even a small deviation from the
+ exact result can be due to many different parts of the
+ analysis. Hence, it is \alert{critical to exactly reproduce} a
+ result.
+ \item The software(s) used, configuration file(s), the order of
+ steps taken, along with the input data are necessary for
+ reproducibility.
+ \item \alert{A solution} is proposed here, which if adopted from
+ the start, can greatly \alert{simplify a scientific research
+ project} and \alert{allow full/exact reproducibility} once it
+ is published.
+ \end{itemize}
+ \end{frame}
+
+
+
+ \begin{frame}{Values in final report/paper}
+ All necessary analysis/processing \alert{input} and \alert{output}
+ values are written into the final report as \LaTeX{} macros. Shown
+ here is a portion of the \textsf{NoiseChisel} paper and its source
+ (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).
+
+ \vspace{1.2cm}
+ \includegraphics[width=\linewidth]{img/reproducible-latex.png}
+ \end{frame}
+
+ \begin{frame}{Values in final report/paper}
+ All necessary analysis/processing \alert{input} and \alert{output}
+ values are written into the final report as \LaTeX{} macros. Shown
+ here is a portion of the \textsf{NoiseChisel} paper and its source
+ (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).
+
+ \vspace{1.2cm}
+ \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png}
+ \end{frame}
+
+
+ \begin{frame}{Values are the pipeline's final product}
+ All the \LaTeX{} macros (processing inputs and outputs) come from
+ a \alert{single file}. This file is the \alert{final product} of
+ the \emph{reproduction pipeline}.
+
+ \begin{center}
+ \includegraphics[width=0.8\linewidth]{img/reproducible-macros.png}
+ \end{center}
+ \end{frame}
+
+
+
+ \begin{frame}{Values are the pipeline's final product}
+ All the \LaTeX{} macros (processing inputs and outputs) come from
+ a \alert{single file}. This file is the \alert{final product} of
+ the \emph{reproduction pipeline}.
+
+ \begin{center}
+ \includegraphics[width=0.8\linewidth]{img/reproducible-macros-highlighted.png}
+ \end{center}
+ \end{frame}
+
+
+ \begin{frame}{Values written during analysis}
+ Various steps of the analysis pipeline write the macro values as
+ soon as they are calculated internally.
+
+ \begin{center}
+ \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro.png}
+ \end{center}
+ \end{frame}
+
+
+ \begin{frame}{Values written during analysis}
+ Various steps of the analysis pipeline write the macro values as
+ soon as they are calculated internally.
+
+ \begin{center}
+ \includegraphics[width=0.8\linewidth]{img/reproducible-write-macro-highlight.png}
+ \end{center}
+ \end{frame}
+
+
+ \begin{frame}{Reproducible science: Pipeline is managed through a Makefile}
+ \small
+ \begin{columns}
+ \column{5.5cm}
+
+ The whole pipeline is managed by Makefiles (example from
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+ \begin{itemize}
+ \setlength\itemsep{0.2cm}
+ \item Unlike a script which always starts from the top, a
+ Makefile \alert{starts from the end} and steps that don't
+ change will be left untouched (not remade).
+ \item A single \emph{rule} can \alert{manage any number of
+ files}. See the examples here where \textsf{NoiseChisel} and
+ \textsf{MakeCatalog} are run separately on \alert{$\sim20$
+ files} (different filters/fields) with a single rule.
+ \item Make can identify independent steps internally and do them
+ in \alert{parallel}.
+ \item Make was \alert{designed for complex problems} with
+ thousands of files (all major Unix-like components), so it is
+ highly evolved and efficient.
+ \item Make is a very \alert{simple} and \alert{small} language,
+ thus easy to learn with great and free documentation (for
+ example
+ \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
+ Make's manual}}, usable to learn all implementations).
+ \end{itemize}
+
+ \column{5.5cm}
+ \includegraphics[width=\linewidth]{img/reproducible-makefile.png}
+ \end{columns}
+ \end{frame}
+
+
+ \begin{frame}{Reproducing the result and report/paper}
+ Once software dependencies are installed, the two \alert{simple}
+ and \alert{familiar} commands below are enough to exactly
+ reproduce the results at any time (as in
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):
+
+ \begin{itemize}
+ \item[] \texttt{\$ ./configure{ }{ }{ }{ }{ }{ }\# To
+ define top-level local directories.}
+ \item[] \texttt{\$ make{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\# To reproduce the analysis and paper.}
+ \end{itemize}
+
+ \vspace{0.5cm} Enabling version control (e.g., \alert{Git}) will
+ make it very easy to test different ideas while not harming the
+ initial/base result (thus encouraging \alert{creativity} and
+ brainstorming during the project).
+
+ \vspace{0.5cm} The pipeline can also \alert{download input} data
+ from online archives (databases) if not locally available (as in
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}
+ and
+ \textcolor{blue}{\href{https://gitlab.com/makhlaghi/reproduction-pipeline-template}{template}}).
+
+ \vspace{0.5cm} After publication, \alert{readers} can
+ \alert{change} the input configurations and the numbers and
+ figures of the reproduced paper will change accordingly. This
+ encourages creativity and brainstorming after the project as well
+ as sharing of (hard-earned) experience with the whole
+ community.
+ \end{frame}
+
+
+
+ \begin{frame}{Publication of the pipeline}
+
+ A reproduction pipeline like this will have the following
+ (\alert{plain text}) components:
+ \begin{itemize}
+ \item Makefiles.
+ \item \LaTeX{} source files.
+ \item Configuration files.
+ \item Scripts/programming files (e.g., Python, Shell, AWK, C).
+ \end{itemize}
+ The \alert{volume} of the reproduction pipeline will thus be
+ \alert{negligible} compared to a single figure in a paper
+ (especially after compression).
+
+ \vspace{1.5cm} The reproduction pipeline can be \alert{published} in
+ \begin{itemize}
+ \item \alert{arXiv}: uploaded with the \TeX{} source to always
+ stay with the paper \\(for example
+ \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The
+ file containing all macros must also be uploaded so arXiv's
+ server can easily build the \LaTeX{} source.
+ \item \alert{Zenodo}: Along with all the input datasets (many
+ Gigabytes) and software \\(for example
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}) and given a unique DOI.
+ \end{itemize}
+
+ \end{frame}
+
+
+
+ \begin{frame}
+ A template/blank pipeline has been written and is ready to use,
+ with implementation guidelines and practical tips and
+ recommendations:
+
+ \textcolor{blue}{\url{https://gitlab.com/makhlaghi/reproducible-paper}}
+
+ \vspace{2.5cm}
+ Please see this page for more:
+
+ \textcolor{blue}{\url{http://akhlaghi.org/reproducible-science.html}}
+ \end{frame}
+\end{document}