aboutsummaryrefslogtreecommitdiff
path: root/slides-intro-short.tex
diff options
context:
space:
mode:
authorMohammad Akhlaghi <mohammad@akhlaghi.org>2020-06-18 02:52:26 +0100
committerMohammad Akhlaghi <mohammad@akhlaghi.org>2020-06-18 02:52:26 +0100
commitd1faba600ae780fa7ba1994cf38504b75119d644 (patch)
tree35db4c13cab378d861f3ff66495d1f06a1d0c9e2 /slides-intro-short.tex
parente543088567ce06626f5b94219d63bddafb1d8d6f (diff)
Completed short slides, with simple macro to cut off increments
The short version of the slides is now ready. There is a '\longformat' macro that, when defined, significantly increases the number of slides without adding substantial content (the extra slides are just the incremental steps). Some minor modifications were also made in the long version.
Diffstat (limited to 'slides-intro-short.tex')
-rw-r--r--slides-intro-short.tex298
1 files changed, 158 insertions, 140 deletions
diff --git a/slides-intro-short.tex b/slides-intro-short.tex
index 88c14aa..79506ea 100644
--- a/slides-intro-short.tex
+++ b/slides-intro-short.tex
@@ -18,6 +18,9 @@
% Basic LaTeX settings.
\documentclass[9pt,usenames,dvipsnames,aspectratio=169]{beamer}
+% Make it super short.
+%\newcommand{\longformat}{}
+
% Read the current Git commit information
\include{git-commit}
\include{tex/preamble}
@@ -34,8 +37,8 @@
(\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}
%% Set the title
-\title{{\huge\textbf{BIG} Data, \textbf{BIG} responsibility} \\
- {\LARGE Towards Long-term and Archivable Reproducibility} \\
+\title{Introducing Maneage:\\
+ Customizable framework for managing data lineage\\
\vspace{2mm}{\small [RDA Europe Adoption grant recipient. Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE} (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}), Comments welcome]}
}
@@ -89,6 +92,7 @@
\begin{frame}
\titlepage
\end{frame}
+ \usebackgroundtemplate{ } %% undeclare it
\begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}}
@@ -105,7 +109,8 @@
\includegraphics[width=4cm]{img/wds.jpg}
\end{center}
- \pause
+ \ifdefined\longformat\pause\fi
+
``\emph{We would like to see a workflow that results in all
\textcolor{blue!30!green}{\bf scholarly objects being connected},
linked, citable, and persistent to allow researchers to navigate
@@ -118,13 +123,14 @@
discoveries.}''
\end{frame}
-\usebackgroundtemplate{ } %% undeclare it
-
-
\newcommand{\allopacity}{1}
+ \ifdefined\longformat
\begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \fi
\newcommand{\paperinit}{}
+ \ifdefined\longformat
\begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
+ \fi
\newcommand{\sver}{}
\newcommand{\srep}{}
\newcommand{\dver}{}
@@ -189,7 +195,7 @@
result.
\end{tcolorbox}
- \pause
+ \ifdefined\longformat\pause\fi
\begin{itemize}
\item \textbf{Complete/self-contained:}
\begin{itemize}
@@ -199,25 +205,25 @@
\item Should be usable \alert{without internet} connection.
\end{itemize}
- \pause
+ \ifdefined\longformat\pause\fi
\item \textbf{Modularity:} Parts of the project should be \alert{re-usable} in other projects.
- \pause
+ \ifdefined\longformat\pause\fi
\item \textbf{Plain text:} Project's source should be in \alert{plain-text} \textcolor{gray}{(binary formats need special software)}
\begin{itemize}
\item This includes high-level analysis.
\item It is easily publishable (very low volume of $\times100$KB), archivable, and parse-able.
\item \alert{Version control} (e.g., with Git) can track project's history.
\end{itemize}
- \pause
+ \ifdefined\longformat\pause\fi
\item \textbf{Minimal complexity:} Occam’s razor: “Never posit pluralities without necessity”.
\begin{itemize}
\item Avoiding the \alert{fashionable} tool of the day: tomorrow another tool will take its place!
\item Easier \alert{learning curve}, also doesn't create a \alert{generational gap}.
\item Is \alert{compatible} and \alert{extensible}.
\end{itemize}
- \pause
+ \ifdefined\longformat\pause\fi
\item \textbf{Verifiable inputs and outputs:} Inputs and Outputs must be \alert{automatically verified}.
- \pause
+ \ifdefined\longformat\pause\fi
\item \textbf{Free and open source software:} \alert{Free software} is essential: non-free software is not configurable, not distributable, and dependent on non-free provider (which may discontinue it in N years).
\end{itemize}
\end{frame}
@@ -231,7 +237,7 @@
-
+ \ifdefined\longformat
\begin{frame}{Predefined/exact software tools}
\small
\begin{columns}
@@ -266,7 +272,7 @@
\includegraphics[width=\linewidth]{img/version.png}
\end{columns}
\end{frame}
-
+ \fi
@@ -310,30 +316,6 @@
- \begin{frame}{Controlled environment and build instructions}
- \small
- \begin{columns}
- \column{5.5cm}
- \includegraphics[width=0.9\linewidth]{img/env.png}
- \column{5.5cm}
- \includegraphics[width=0.9\linewidth]{img/build.png}
- \end{columns}
- \end{frame}
-
- \begin{frame}{Controlled environment and build instructions}
- \small
- \begin{columns}
- \column{5.5cm}
- \includegraphics[width=0.9\linewidth]{img/env-highlighted.png}
- \column{5.5cm}
- \includegraphics[width=0.9\linewidth]{img/build-highlighted.png}
- \end{columns}
- \end{frame}
-
-
-
-
-
\begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies}
\Wider[5em]{
%\vspace{5mm}
@@ -343,8 +325,7 @@
\vspace{3mm}\tiny From ``Attributing and Referencing (Research)
Software: Best Practices and Outlook from Inria'' (Alliez et
- al. 2019,
- \textcolor{blue}{\href{https://hal.archives-ouvertes.fr/hal-02135891}{hal-02135891}})
+ al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}).
}
\end{frame}
@@ -379,10 +360,14 @@
\end{frame}
+
+
+ \ifdefined\longformat
\begin{frame}{Software citation automatically generated in paper (including Astropy)}
\centering
\includegraphics[width=0.8\linewidth]{img/software-cite.jpg}
\end{frame}
+ \fi
\begin{frame}{Software citation automatically generated in paper (including Astropy)}
\centering
\includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg}
@@ -393,6 +378,7 @@
\begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
\let\focusonhardware\undefined
+ \ifdefined\longformat
\begin{frame}{Input data source and integrity is documented and checked}
\small
\begin{columns}
@@ -418,6 +404,7 @@
\includegraphics[width=\linewidth]{img/inputs.png}
\end{columns}
\end{frame}
+ \fi
\begin{frame}{Input data source and integrity is documented and checked}
\small
@@ -463,7 +450,7 @@
-
+ \ifdefined\longformat
\begin{frame}{Reproducible science: Maneage is managed through a Makefile}
\small
\begin{columns}
@@ -530,6 +517,7 @@
\includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-1.png}
\end{columns}
\end{frame}
+ \fi
\begin{frame}{Reproducible science: Maneage is managed through a Makefile}
\small
\begin{columns}
@@ -592,7 +580,7 @@
\begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
\let\focusonpaper\undefined
-
+ \ifdefined\longformat
\begin{frame}{Values in final report/paper}
All analysis \alert{results} (numbers, plots, tables) written in
paper's PDF as \alert{\LaTeX{} macros}. They are thus
@@ -603,6 +591,7 @@
\vspace{0.4cm}
\includegraphics[width=\linewidth]{img/reproducible-latex.png}
\end{frame}
+ \fi
\begin{frame}{Values in final report/paper}
All analysis \alert{results} (numbers, plots, tables) written in
@@ -618,13 +607,14 @@
-
+ \ifdefined\longformat
\begin{frame}{Analysis step results/values concatenated into a single file.}
All \LaTeX{} macros come from a \alert{single file}.
\begin{center}
\includegraphics[width=0.6\linewidth]{img/reproducible-macros.png}
\end{center}
\end{frame}
+ \fi
\begin{frame}{Analysis step results/values concatenated into a single file.}
All \LaTeX{} macros come from a \alert{single file}.
\begin{center}
@@ -637,7 +627,7 @@
-
+ \ifdefined\longformat
\begin{frame}{Analysis results stored as \LaTeX{} macros}
The analysis scripts write/update the \LaTeX{} macro values
automatically.
@@ -645,6 +635,7 @@
\includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png}
\end{center}
\end{frame}
+ \fi
\begin{frame}{Analysis results stored as \LaTeX{} macros}
The analysis scripts write/update the \LaTeX{} macro values
automatically.
@@ -654,12 +645,35 @@
\end{frame}
+
%% Make demo.
- \begin{frame}
- \LARGE
- \vspace{1cm}
- \hfill Let's see how the analysis is managed in a hypothetical project...
+ \begin{frame}{Let's look at the data lineage to replicate Figure 1C (green/tool) of Menke+2020 \\(DOI:\href{https://doi.org/10.1101/2020.01.15.908111}{10.1101/2020.01.15.908111}), as done in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}} for a demo.}
+ \begin{columns}
+ \column{0.55\linewidth}
+ \textcolor{blue}{ORIGINAL PLOT}
+
+ The Green plot shows the fraction of papers mentioning software tools from 1997 to 2019.
+ \column{0.45\linewidth}
+ \includegraphics[width=\linewidth]{img/tools-per-year-orig.jpg}
+ \end{columns}
+
+ \rule{\textwidth}{1pt}
+
+ \begin{columns}
+ \column{0.4\linewidth}
+ \textcolor{green!70!black}{OUR enhanced REPLICATION}
+
+ The green line is the same as above but over their full historical range.
+
+ Red histogram is the number of papers studied in each year
+ \column{0.6\linewidth}
+ \vspace{1cm}
+ \includegraphics[width=\linewidth]{img/tools-per-year.pdf}
+ \end{columns}
\end{frame}
+
+
+ \ifdefined\longformat
\makedemoslide{img/data-lineage-1.pdf}
{Makefiles (\texttt{\*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}}
\makedemoslide{img/data-lineage-2.pdf}
@@ -673,37 +687,31 @@
\makedemoslide{img/data-lineage-6.pdf}
{Basic project info comes from \texttt{initialize.tex}.}
\makedemoslide{img/data-lineage-7.pdf}
- {Reported values about the downloaded inputs come from \texttt{download.tex}.}
+ {The paper includes some information about the plot.}
\makedemoslide{img/data-lineage-8.pdf}
- {... for example the number of rows in the second input (a catalog) of the project.}
+ {The final plotted data are calculated and stored in \texttt{tools-per-year.txt}.}
\makedemoslide{img/data-lineage-9.pdf}
- {The URL to download \texttt{input2.dat}, and a checksum to validate it, are stored in \texttt{INPUTS.conf}.}
+ {The plot's calculation is done on a formatted sub-set of the raw input data.}
\makedemoslide{img/data-lineage-10.pdf}
- {Reported values from first analysis steps stored in \texttt{analysis1.tex}.}
+ {The raw data that were downloaded are stored in XLSX format.}
\makedemoslide{img/data-lineage-11.pdf}
- {... for example the average of the numbers in \texttt{out-1b.dat}.}
+ {The download URL \emph{and} a \alert{checksum to validate} the raw inputs, are stored in \texttt{INPUTS.conf}.}
\makedemoslide{img/data-lineage-12.pdf}
- {But \texttt{out-1b.dat} itself depends on other files and a paramter (for example a multiple of sigma).}
+ {We also need to report the URL in the paper...}
\makedemoslide{img/data-lineage-13.pdf}
- {\texttt{out-1a.dat} is built from a downloaded dataset.}
+ {Some general info about the full dataset may also be reported.}
+ \fi
+
+ \ifdefined\longformat
+ \makedemoslide{img/data-lineage-14.pdf}
+ {We report the number of papers studied in a specific year; the desired year is stored in a \texttt{.conf} file.}
+ \else
\makedemoslide{img/data-lineage-14.pdf}
- {Download URL and checksum of \texttt{input1.dat} also stored in \texttt{INPUTS.conf}.}
+ {All analysis steps cascade down to paper.pdf (URL and checksum of input in \texttt{INPUTS.conf}).}
+ \fi
+
\makedemoslide{img/data-lineage-15.pdf}
- {Reported values from second analysis steps stored in \texttt{analysis2.tex}.}
- \makedemoslide{img/data-lineage-16.pdf}
- {... for example the number of selected rows in \texttt{out-2b.dat}.}
- \makedemoslide{img/data-lineage-17.pdf}
- {\texttt{out-2b.dat} is derived from \texttt{out-1b.dat} (for example, rejected some of \texttt{out-1b.dat}'s rows).}
- \makedemoslide{img/data-lineage-18.pdf}
- {Reported values from third analysis steps stored in \texttt{analysis3.tex}.}
- \makedemoslide{img/data-lineage-19.pdf}
- {... for example measurements from both \texttt{out-3a.dat} and \texttt{out-3b.dat}.}
- \makedemoslide{img/data-lineage-20.pdf}
- {\texttt{out-3b.dat} is generated from an analysis on \texttt{out-2a.dat}.}
- \makedemoslide{img/data-lineage-21.pdf}
- {But \texttt{out-2a.dat} itself is generated from \texttt{input1.dat} and an analysis which has two settings.}
- \makedemoslide{img/data-lineage-22.pdf}
- {\texttt{out-3a.dat} also depends on \texttt{out-1a.dat} and an analysis with needs one parameter.}
+ {It is very easy to expand the project and add new analysis steps (this solution is scalable)}
@@ -742,7 +750,7 @@
-
+ \ifdefined\longformat
\begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
\newcommand{\projinit}{}
\begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
@@ -757,8 +765,18 @@
\newcommand{\githappy}{}
\begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
\newcommand{\gitverified}{}
+ \else
+ \newcommand{\projinit}{}
+ \newcommand{\projwork}{}
+ \newcommand{\tempevolve}{}
+ \newcommand{\mergewithtemp}{}
+ \newcommand{\tofuture}{}
+ \newcommand{\githappy}{}
+ \newcommand{\gitverified}{}
+ \fi
\begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \ifdefined\longformat
\begin{frame}{Two recent examples (publishing Git checksum in abstract)}
\begin{columns}
\column{0.5\linewidth}
@@ -769,6 +787,7 @@
\includegraphics[width=0.8\linewidth]{img/firstpage-mnras491.png}
\end{columns}
\end{frame}
+ \fi
\begin{frame}{Two recent examples (publishing Git checksum in abstract)}
\begin{columns}
@@ -804,12 +823,10 @@
\begin{itemize}
\item \alert{arXiv}: uploaded with the \LaTeX{} source to always
stay with the paper \\(for example
- \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The
- file containing all macros must also be uploaded so arXiv's
- server can easily build the \LaTeX{} source.
+ \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}} or \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}).
\item \alert{Zenodo}: Along with all the input datasets (many
Gigabytes) and software \\(for example
- \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}}) and given a unique DOI.
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3872248}{zenodo.3872248}}) and given a unique DOI.
\end{itemize}
\end{frame}
@@ -817,39 +834,36 @@
- \begin{frame}{Project source and its execution}
- \begin{tcolorbox}
- Programs \textcolor{gray}{[here: Scientific projects]} must be
- written for \alert{people to read}...
-
- \hfill ...and only \emph{incidentally} for machines to
- \emph{execute}.
-
- \vspace{2mm}
- \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs
- \end{tcolorbox}
- \end{frame}
-
-
-
-
-
- \begin{frame}[t]{General outline of using this system (for example \href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230})}
+ \begin{frame}[t]{General outline of using Maneage (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})}
\vspace{1cm}
- \texttt{\$ git clone http://gitlab.com/makhlaghi/iau-symposium-355{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\
+ \texttt{\$ git clone https://gitlab.com/makhlaghi/maneage-paper{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\
- \pause
+ \ifdefined\longformat\pause\fi
\vspace{1.5cm}
\texttt{\$ ./project configure { }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# You will specify the build directory on your system,}}\\
\texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# and it will build all software (about 1.5 hours).}}
- \pause
+ \ifdefined\longformat\pause\fi
\vspace{1.5cm}
\texttt{\$ ./project make { }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# Does all the analysis and makes final PDF.}}\\
\end{frame}
+ \ifdefined\longformat
+ \begin{frame}{Project source and its execution}
+ \begin{tcolorbox}
+ Programs \textcolor{gray}{[here: Scientific projects]} must be
+ written for \alert{people to read}...
+
+ \hfill ...and only \emph{incidentally} for machines to
+ \emph{execute}.
+
+ \vspace{2mm}
+ \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs
+ \end{tcolorbox}
+ \end{frame}
+ \fi
@@ -887,6 +901,52 @@
\end{itemize}
\end{frame}
+
+
+
+
+ \begin{frame}{Summary:}
+
+ Maneage and its principles are described in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}.
+ It is a customizable template that will do the following steps/instructions (all in simple plain text files).
+ \begin{itemize}
+ \item \alert{Automatically downloads} the necessary
+ \emph{software} and \emph{data}.
+ \item \alert{Builds} the software in a \alert{closed
+ environment}.
+ \item Runs the software on data to \alert{generate} the final
+ \alert{research results}.
+ \item Modification of part of the analysis will only
+ result in re-doing that part, not the whole project.
+ \item Using LaTeX macros, paper's figures, tables and numbers
+ will be \alert{Automatically updated} after a change in
+ analysis. Allowing the scientist to focus on the scientific
+ interpretation.
+ \item The whole project is under \alert{version control} (Git)
+ to allow easy reversion to a previous state. This
+ \alert{encourages tests/experimentation} in the analysis.
+ \item The \alert{Git commit hash} of the project source, is
+ \alert{printed} in the published paper and \alert{saved on
+ output} data products. Ensuring the
+ integrity/reproducibility of the result.
+ \item \colorbox{green!30!white}{These slides are available at
+ \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.}
+ \item \colorbox{green!15!white}{Longer slides are available at
+ \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.}
+ \end{itemize}
+
+ \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
+ top=1pt, bottom=1pt]
+ For a technical description of Maneage's implementation, as well
+ as a checklist to customize it, and tips on good practices,
+ please see this page:
+
+ \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}}
+ \end{tcolorbox}
+ \end{frame}
+
+
+\ifdefined\longformat
\begin{frame}{Existing technologies (Independent environment)}
\begin{itemize}
\setlength\itemsep{7mm}
@@ -910,13 +970,11 @@
\vspace{3mm}
In summary, they only \alert{store a built} environment (they are outputs, not good for archiving).
-
\end{frame}
-
\begin{frame}{Existing technologies (Package managers)}
\begin{itemize}
@@ -957,45 +1015,5 @@ In summary, they only \alert{store a built} environment (they are outputs, not g
\end{itemize}
\alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}.
\end{frame}
-
-
-
-
- \begin{frame}{Summary:}
-
- Maneage is introduced as a customizable template that will do the
- following steps/instructions (all in simple plain text files).
- \begin{itemize}
- \item \alert{Automatically downloads} the necessary
- \emph{software} and \emph{data}.
- \item \alert{Builds} the software in a \alert{closed
- environment}.
- \item Runs the software on data to \alert{generate} the final
- \alert{research results}.
- \item A modification in one part of the analysis will only
- result in re-doing that part, not the whole project.
- \item Using LaTeX macros, paper's figures, tables and numbers
- will be \alert{Automatically updated} after a change in
- analysis. Allowing the scientist to focus on the scientific
- interpretation.
- \item The whole project is under \alert{version control} (Git)
- to allow easy reversion to a previous state. This
- \alert{encourages tests/experimentation} in the analysis.
- \item The \alert{Git commit hash} of the project source, is
- \alert{printed} in the published paper and \alert{saved on
- output} data products. Ensuring the
- integrity/reproducibility of the result.
- \item \colorbox{green!30!white}{These slides are available at
- \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.}
- \end{itemize}
-
- \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
- top=1pt, bottom=1pt]
- For a technical description of Maneage's implementation, as well
- as a checklist to customize it, and tips on good practices,
- please see this page:
-
- \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}}
- \end{tcolorbox}
- \end{frame}
+\fi
\end{document}