aboutsummaryrefslogtreecommitdiff
path: root/slides-intro-short.tex
diff options
context:
space:
mode:
Diffstat (limited to 'slides-intro-short.tex')
-rw-r--r--slides-intro-short.tex647
1 files changed, 445 insertions, 202 deletions
diff --git a/slides-intro-short.tex b/slides-intro-short.tex
index 3bc3f8f..3e0f8ff 100644
--- a/slides-intro-short.tex
+++ b/slides-intro-short.tex
@@ -1,6 +1,6 @@
% LaTeX source of slides on reproducible paper.
%
-% Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
+% Copyright (C) 2020-2022 Mohammad Akhlaghi <mohammad@akhlaghi.org>
%
% This LaTeX source is free software: you can redistribute it and/or
% modify it under the terms of the GNU General Public License as
@@ -37,34 +37,32 @@
(\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}}
%% Set the title
-\title{Introducing Maneage:\\
- Customizable framework for managing data lineage\\
- \vspace{2mm}{\small [RDA Europe Adoption grant recipient. Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE} (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}), Comments welcome]}
-}
+\title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility
+ \\\vspace{2mm} \large Maneage: Managing data lineage for long-term and archivable reproducibility \\\vspace{1mm} \footnotesize (Published in CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}})}
%% Set the author
-\author{\vspace{8mm}\\
+\author{\\
\href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm}
\footnotesize
- Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain
+ Centro de Estudios de F\'isica del Cosmos de Arag\'on ({\scriptsize CEFCA}), Teruel, Spain\vspace{8mm}
}
%% Set the date and insitutional logos.
-\date{\footnotesize\vspace{0cm}\\
- \href{https://www.rd-alliance.org/rda-global-adoption-week-15-19-june-2020}{RDA Global Adoption week}\\June 18th, 2020\\
- \tiny\vspace{3mm}
+\date{\footnotesize\vspace{-9mm}\\
+ \textcolor{black}{Royal Observatory Coffee talk; Edinburgh}\\
+ \textcolor{black}{23rd of May 2023} \\
+ \tiny\vspace{10mm}
Most recent slides available in link below (this PDF is built from \href{http://git.maneage.org/slides-intro.git}{Git commit} \gitcommit):\\
\footnotesize\textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}\\
- \vspace{2mm}\hspace{-0.25cm}
- \raisebox{+0.4\height}{\includegraphics[width=2.5cm]{img/ministerio-ciencia.png}}
- \raisebox{+0.3\height}{\includegraphics[width=1.3cm]{img/sundial.png}}
+ \vspace{2mm}
+ \raisebox{+0.2\height}{\includegraphics[width=3.5cm]{img/jcava.jpg}}
+ \includegraphics[width=9.5mm]{img/cefca.png}
\includegraphics[width=1.2cm]{img/iac.png}
- \includegraphics[width=1cm]{img/eu-sundial.png}
\raisebox{0.13\height}{\includegraphics[width=1cm]{img/eu-regional.png}}
\raisebox{0.05\height}{\includegraphics[width=1cm]{img/eu-rdaeu4.png}}
\raisebox{+0.1\height}{\includegraphics[width=1.4cm]{img/rda-europe.png}}
- \raisebox{+1.3\height}{\includegraphics[width=1.4cm]{img/ull.png}}
- { }\raisebox{+0.5\height}{\includegraphics[width=2cm]{img/gobierno-canarias.png}}\\
+ \raisebox{+1\height}{\includegraphics[width=1.5cm]{img/aragon.png}}
+ \raisebox{+0.8\height}{\includegraphics[width=1.6cm]{img/gobierno-canarias.png}}\\
\vspace{1cm}
}
@@ -86,7 +84,6 @@
-
\begin{document}
\begin{frame}
@@ -95,34 +92,42 @@
\usebackgroundtemplate{ } %% undeclare it
- \begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}}
- Challenges (also relevant to researchers, not just repositories)
- \begin{itemize}
- \item \emph{Bi-directional linking}: how to \alert{link data and publications}.
- \item \emph{\alert{Software management}:} how to manage, preserve, publish and cite software?
- \item \emph{Metrics:} \alert{how often} are data used.
- \item \emph{Incentives to researchers:} how to \alert{communicate benefits} of following good practices \alert{to researchers}.
- \end{itemize}
+
+
+
+ %% Introduction to OAJ and J-PAS
+ \begin{frame}{Our main project: \textbf{J-PAS} with Observatorio Astrof\'isico de Javalambre (OAJ)}
+ J-PAS will observe the northern sky in \alert{56 medium-band filters} ($\sim14$nm):
\begin{center}
- \includegraphics[width=4cm]{img/rda.png}\hspace{1cm}
- \includegraphics[width=4cm]{img/wds.jpg}
+ \includegraphics[width=0.9\linewidth]{img/oaj.jpg}
\end{center}
+ \end{frame}
- \ifdefined\longformat\pause\fi
+ \begin{frame}{LSST filters: 6 {\footnotesize(image from \href{https://speclite.readthedocs.io/en/latest/filters.html}{speclite docs})}}
+ \begin{center}
+ \vspace{-3mm}
+ \includegraphics[width=0.8\linewidth]{img/filters-lsst.png}
+ \end{center}
+ \end{frame}
- ``\emph{We would like to see a workflow that results in all
- \textcolor{blue!30!green}{\bf scholarly objects being connected},
- linked, citable, and persistent to allow researchers to navigate
- smoothly and to \alert{\bf enable reproducible research}. This
- includes \alert{{\bf linkages} between documentation, code, data, and
- journal articles in an integrated environment}. Furthermore,
- in the ideal workflow, all of these objects need to be
- \alert{\bf well documented} to enable other researchers (or
- citizen scientists etc) to reuse the data for new
- discoveries.}''
+ \begin{frame}{J-PAS filters: 56 (Bonoli+2021: \href{https://ui.adsabs.harvard.edu/abs/2021A\%26A...653A..31B}{2021A\&A...653A..31B})}
+ \begin{center}
+ \includegraphics[width=\linewidth]{img/filters-jpas.pdf}
+ \end{center}
\end{frame}
+ \begin{frame}{Result: photo-\alert{spectra} of \alert{every pixel} of the non-Galactic northern sky (like an IFU)!}
+ \url{http://archive.cefca.es/catalogues/minijpas-pdr201912/navigator.html}
+
+ \vspace{3mm}
+ \includegraphics[width=\linewidth]{img/minijpas-web.png}
+ \end{frame}
+
+
+
+
+
\newcommand{\allopacity}{1}
\ifdefined\longformat
\begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
@@ -165,23 +170,16 @@
\begin{frame}{Science is a tricky business}
- \begin{center}
- \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg}
- \end{center}
-
- \vspace{-0.3cm}\hfill
- {\tiny Image from nature.com
- (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five
- ways to fix statistics}'', Nov 2017)}\hspace{7mm}
- \vspace{-1mm}
\begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm]
- \small Data analysis [...] is a \alert{human
+ \large Data analysis [...] is a \alert{human
behavior}. Researchers who hunt hard enough will turn up a
result that fits statistical criteria, but their
\alert{discovery} will probably be a \alert{false positive}.
- \hfill Five ways to fix statistics, Nature, 551, Nov 2017.
+ \vspace{3mm}
+ \small
+ \hfill Five ways to fix statistics (Nature, 551, Nov 2017; DOI:\textcolor{blue}{\href{https://doi.org/10.1038/d41586-017-07522-z}{10.1038/d41586-017-07522-z}}).
\end{tcolorbox}
\end{frame}
@@ -189,6 +187,303 @@
+ \begin{frame}{``Reproducibility crisis'' in the sciences? (Baker 2016, Nature 533, 452, \textcolor{blue}{\href{https://doi.org/10.1038/533452a}{DOI:10.1038/533452a}})}
+ \Large
+ 1576 researchers participated in a survey by Nature, \alert{$90\%$} believed in a crisis!
+
+ \vspace{7mm}
+ \begin{center}
+ \begin{tabular}{ |l|r| }
+ \hline
+ Status & $\%$ agreed \\
+ \hline
+ \alert{Yes}, a significant crisis & \textcolor{red}{$52$} \\
+ \alert{Yes}, a slight crisis & \textcolor{red}{$38$} \\
+ Don't know & $7$ \\
+ No, there is no crisis & $3$ \\
+ \hline
+ \end{tabular}
+ \end{center}
+
+ \vspace{7mm}
+ Full PDF available at \textcolor{blue}{\url{https://www.nature.com/articles/533452a.pdf}}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Notebooks are not long-term solutions {\small (see appendices of Akhlaghi+2021: \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})}}
+ \begin{columns}
+ \column{0.4\linewidth}
+ \includegraphics[width=\linewidth]{img/dependencies-conda-initial.png}
+ \column{0.4\linewidth}
+ \includegraphics[width=0.9\linewidth]{img/dependencies-jupyter.png}
+ \column{0.2\linewidth}
+ Results from run on May 10th, 2022:
+
+ \pause
+ \vspace{7mm}
+ Conda setup:\\\alert{39 dependencies}
+
+ \pause
+ \vspace{7mm}
+ Jupyter (with Pip):\\\alert{61 dependencies}
+
+ \pause
+ \vspace{7mm}
+ Web browser has more dependencies; with fluid/\alert{evolving} web technologies.
+
+ \pause
+ \vspace{7mm}
+ They can contain \alert{binary} components.
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{The dependency tree (Matplotlib is \emph{only one} dependency of Jupyter)}
+ \Wider[5em]{
+ %\vspace{5mm}
+ \begin{center}
+ \includegraphics[width=0.9\linewidth]{img/matplotlib.png}
+ \end{center}
+
+ \vspace{3mm}\tiny From ``Attributing and Referencing (Research)
+ Software: Best Practices and Outlook from Inria'' (Alliez et
+ al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}).
+ }
+ \end{frame}
+
+
+
+
+ \begin{frame}{Are containers the solution? Yes, but ... for the short term}
+ \pause
+ \begin{itemize}
+ \setlength\itemsep{5mm}
+ \item Containers are \alert{large} (many giga-bytes)
+ \begin{itemize}
+ \setlength\itemsep{3mm}
+ \pause
+ \item \alert{Expensive} to archive!
+ \pause
+ \item Example: \textcolor{blue}{\href{https://is.ieis.tue.nl/staff/pvgorp/share}{SHARE}} (enabling remote connection to Virtual machines with project environment):
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item \alert{2nd place} in Elsevier's ``Executable paper grand challenge'' of 2011.
+ \item SHARE's image repository was taken offline in 2019!
+ \item Even the challenge webpage is no longer available: \textcolor{blue}{\href{http://www.executablepapers.com}{http://www.executablepapers.com}}
+ \end{itemize}
+ \end{itemize}
+ \pause
+ \item Containers are \alert{binary} (tailored to certain kernels+CPUs)
+ \begin{itemize}
+ \setlength\itemsep{3mm}
+ \pause
+ \item Only guarantee the Long Term Release kernels.
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item Become un-readable, multi-gigabyte binary blobs in $\sim10$ years!
+ \item Even if you store them on Zenodo!
+ \end{itemize}
+ \pause
+ \item Only on common CPU architectures.
+ \end{itemize}
+ \pause
+ \item Containers \alert{themselves} are \alert{hard to reproduce}.
+ \begin{itemize}
+ \item Example: \textcolor{blue}{\href{https://ui.adsabs.harvard.edu/abs/2020CSE....22a.102M}{2020CSE....22a.102M}} use `\texttt{FROM ubuntu:16.04}', but if run today, \textcolor{blue}{\href{https://partner-images.canonical.com/core/xenial}{images are from 2021}}.
+ \end{itemize}
+ \end{itemize}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}
+ \Large For \alert{longevity issues} with Jupyter, Conda, Containers, etc.\ ...
+
+ \vspace{3mm}
+ As well as a survey of \alert{deprecated}/\alert{abandoned}/\alert{lost} solutions since the \alert{1990s} ...
+
+ \vspace{5mm}
+ \hfill ... see the appendices in \textcolor{blue}{\href{https://arxiv.org/pdf/2006.03018.pdf}{arXiv:2006.03018}}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Our solution: CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}}
+ \begin{columns}
+ \column{0.4\linewidth}
+ \includegraphics[width=\linewidth]{img/maneage-paper.png}
+ \column{0.6\linewidth}
+ \includegraphics[width=\linewidth]{img/maneage-webpage.png}
+ \begin{center}
+ \huge{https://maneage.org}
+ \end{center}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Recognition 1: RDA adoption grant (2019) to IAC for Maneage}
+ \begin{center}
+ \includegraphics[width=3cm]{img/rda.png}\hspace{1cm}
+ \includegraphics[width=1.8cm]{img/iac.png}
+ \includegraphics[width=\linewidth]{img/h2020.jpg}
+ \end{center}
+
+ \vspace{1cm} For Maneage, the \alert{IAC} is selected as
+ a \alert{Top European organization} funded to adopt RDA
+ Recommendations and Outputs.
+
+ \vspace{1cm}
+ \scriptsize
+ \begin{itemize}
+ \item Research Data Alliance was launched by the \alert{European
+ Commission}, NSF, National Institute of Standards and
+ Technology, and the Australian Government’s Department of
+ Innovation.
+ \item RDA Outputs are the technical and social infrastructure
+ solutions developed by RDA Working Groups or Interest
+ Groups that enable data sharing, exchange, and
+ interoperability.
+ \end{itemize}
+
+ \vspace{0.2cm}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}{Recognition 2: ``News and Views'' in Nature Astronomy (\textcolor{blue}{\href{https://doi.org/10.1038/s41550-021-01402-3}{DOI:10.1038/s41550-021-01402-3}})}
+ \begin{center}
+ \includegraphics[width=0.8\linewidth]{img/nature-astronomy.png}
+ \end{center}
+
+ \vspace{-2mm}
+ \footnotesize Free-to-read link: \textcolor{blue}{\url{https://rdcu.be/cmYVx}}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}}
+ \vspace{-5mm}
+ \begin{columns}[t]
+ \column{0.5\linewidth}
+ \begin{center}
+ \large\textbf{Replicability (hardware/statistical)}
+ \rule{0.5\linewidth}{1pt}
+ \end{center}
+ \begin{itemize}
+ \setlength\itemsep{0.5em}
+ \item Involves data \alert{collection}.
+ \item Inherently includes \alert{measurement errors}\\(can
+ never be exactly reproduced).
+ \item Example: Raw telescope image/spectra.
+ \item \alert{\textbf{NOT DISCUSSED HERE.}}
+ \end{itemize}
+
+ \vspace{3.5mm}
+ \begin{center}
+ \vspace{-5mm}
+ \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\
+ \vspace{-0.6mm}
+ \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk}
+ \end{center}
+
+ \column{0.5\linewidth}
+ \end{columns}
+ \end{frame}
+ \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}}
+ \vspace{-5mm}
+ \begin{columns}[t]
+ \column{0.5\linewidth}
+ \begin{center}
+ \large\textbf{Replicability (hardware/statistical)}
+ \rule{0.5\linewidth}{1pt}
+ \end{center}
+ \begin{itemize}
+ \setlength\itemsep{0.5em}
+ \item Involves data \alert{collection}.
+ \item Inherently includes \alert{measurement errors}\\(can
+ never be exactly reproduced).
+ \item Example: Raw telescope image/spectra.
+ \item \alert{\textbf{NOT DISCUSSED HERE.}}
+ \end{itemize}
+
+ \vspace{3.5mm}
+ \begin{center}
+ \vspace{-5mm}
+ \includegraphics[width=0.7\linewidth]{img/hale-prime-focus-marked.jpg}\\
+ \vspace{-0.6mm}
+ \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk}
+ \end{center}
+
+ \column{0.5\linewidth}
+ \end{columns}
+ \end{frame}
+ \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}}
+ \vspace{-5mm}
+ \begin{columns}[t]
+ \column{0.5\linewidth}
+ \begin{center}
+ \large\textbf{Replicability (hardware/statistical)}
+ \rule{0.5\linewidth}{1pt}
+ \end{center}
+ \begin{itemize}
+ \setlength\itemsep{0.5em}
+ \item Involves data \alert{collection}.
+ \item Inherently includes \alert{measurement errors}\\(can
+ never be exactly reproduced).
+ \item Example: Raw telescope image/spectra.
+ \item \alert{\textbf{NOT DISCUSSED HERE.}}
+ \end{itemize}
+
+ \vspace{3.5mm}
+ \begin{center}
+ \vspace{-5mm}
+ \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\
+ \vspace{-0.6mm}
+ \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk}
+ \end{center}
+
+ \column{0.5\linewidth}
+ \begin{center}
+ \large\textbf{Reproducibility (Software/Deterministic)}
+ \rule{0.5\linewidth}{1pt}
+ \end{center}
+ \begin{itemize}
+ \setlength\itemsep{1em}
+ \item Involves data \alert{analysis}, or simulations.
+ \item Starts \alert{after} data is collected/digitized.
+ \item Example: $2+2=4$ (i.e., sum of datasets).
+ \item \textbf{\textcolor{green!50!black}{DISCUSSED HERE.}}
+ \end{itemize}
+
+ \centering
+ \vspace{3mm}
+ \includegraphics[width=0.88\linewidth]{img/binary-blue.jpg}\\
+ \vspace{-0.6mm}
+ \tiny \href{https://commons.wikimedia.org/wiki/File:Binary_blue.jpg}{Wikimedia Commons}
+ \end{columns}
+ \end{frame}
+
+
+
+
+
\begin{frame}{Founding criteria}
\begin{tcolorbox}[title=Basic/simple principle:]
\centering Science is defined by its METHOD, \alert{not} its
@@ -237,7 +532,7 @@
- \ifdefined\longformat
+
\begin{frame}{Predefined/exact software tools}
\small
\begin{columns}
@@ -251,28 +546,43 @@
reproducibility.
\end{tcolorbox}
- \vspace{2cm}
-
\begin{itemize}
- \setlength\itemsep{0.6cm}
+ \setlength\itemsep{2mm}
\item \emph{Containers} or \emph{Virtual Machines} are a
\alert{binary black box}.
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item e.g., with `\texttt{FROM ubuntu:16.04}' (released in April 2016),
+ \item in a \texttt{Dockerfile}, the OS image will come from (updated monthly!): \url{https://partner-images.canonical.com/core/xenial}
+ \end{itemize}
\item Maneage \alert{installs fixed versions} of all
- necessary research software and their dependencies.
+ necessary research software.
+ \begin{itemize}
+ \item Including their dependencies.
+ \item All the way down to the C compiler.
+ \end{itemize}
\item Installs similar environment on \alert{GNU/Linux}, or
\alert{macOS} systems.
- \item Works very much like a package manager (e.g.,
- \alert{\texttt{apt}} or \alert{\texttt{brew}}).
+ \item Works like a package manager (e.g.,
+ \alert{\texttt{apt}}, \alert{\texttt{brew}} or Conda).
+ \begin{itemize}
+ \item ... \alert{but (!)}, it's not a third-party package manager.
+ \item Build instructions are within the same analysis project.
+ \item e.g., see Conda's build of Gnuastro (it gets updated behind your back): \url{https://anaconda.org/conda-forge/gnuastro/files}
+ \end{itemize}
+
+ \item Source code of all software in Maneage is archived on
+ \textcolor{blue}{\href{https://doi.org/10.5281/zenodo.3883409}{zenodo.3883409}}.
\end{itemize}
\column{5cm}
\includegraphics[width=\linewidth]{img/version.png}
\end{columns}
\end{frame}
- \fi
+
@@ -290,21 +600,36 @@
reproducibility.
\end{tcolorbox}
- \vspace{2cm}
-
\begin{itemize}
- \setlength\itemsep{0.6cm}
+ \setlength\itemsep{2mm}
\item \emph{Containers} or \emph{Virtual Machines} are a
\alert{binary black box}.
+ \begin{itemize}
+ \setlength\itemsep{2mm}
+ \item e.g., with `\texttt{FROM ubuntu:16.04}' (released in April 2016),
+ \item in a \texttt{Dockerfile}, the OS image will come from (updated monthly!): \url{https://partner-images.canonical.com/core/xenial}
+ \end{itemize}
\item Maneage \alert{installs fixed versions} of all
- necessary research software and their dependencies.
+ necessary research software.
+ \begin{itemize}
+ \item Including their dependencies.
+ \item All the way down to the C compiler.
+ \end{itemize}
\item Installs similar environment on \alert{GNU/Linux}, or
\alert{macOS} systems.
- \item Works very much like a package manager (e.g.,
- \alert{\texttt{apt}} or \alert{\texttt{brew}}).
+ \item Works like a package manager (e.g.,
+ \alert{\texttt{apt}}, \alert{\texttt{brew}} or Conda).
+ \begin{itemize}
+ \item ... \alert{but (!)}, it's not a third-party package manager.
+ \item Build instructions are within the same analysis project.
+ \item e.g., see Conda's build of Gnuastro (it gets updated behind your back): \url{https://anaconda.org/conda-forge/gnuastro/files}
+ \end{itemize}
+
+ \item Source code of all software in Maneage is archived on
+ \textcolor{blue}{\href{https://doi.org/10.5281/zenodo.3883409}{zenodo.3883409}}.
\end{itemize}
\column{5cm}
@@ -316,25 +641,6 @@
- \begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies}
- \Wider[5em]{
- %\vspace{5mm}
- \begin{center}
- \includegraphics[width=0.9\linewidth]{img/matplotlib.png}
- \end{center}
-
- \vspace{3mm}\tiny From ``Attributing and Referencing (Research)
- Software: Best Practices and Outlook from Inria'' (Alliez et
- al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}).
- }
- \end{frame}
-
-
-
-
-
-
-
\begin{frame}{Advantages of this build system}
\begin{columns}
\column{11cm}
@@ -714,7 +1020,15 @@
{It is very easy to expand the project and add new analysis steps (this solution is scalable)}
+ \begin{frame}{Files organized in directories by context (here are some of the files discussed before)}
+ \centering
+ \includegraphics[width=0.85\linewidth]{img/figure-file-architecture-1.pdf}
+ \end{frame}
+ \begin{frame}{Files organized in directories by context (now with other project files and symbolic links)}
+ \centering
+ \includegraphics[width=0.85\linewidth]{img/figure-file-architecture-2.pdf}
+ \end{frame}
@@ -752,6 +1066,10 @@
\ifdefined\longformat
\begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\tomorrow}{1}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
+ \newcommand{\abstractify}{1}
+ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
\newcommand{\projinit}{}
\begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
\newcommand{\projwork}{}
@@ -766,6 +1084,7 @@
\begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
\newcommand{\gitverified}{}
\else
+ \newcommand{\abstractify}{1}
\newcommand{\projinit}{}
\newcommand{\projwork}{}
\newcommand{\tempevolve}{}
@@ -818,15 +1137,22 @@
\alert{negligible} compared to a single figure in a paper
(usually $\sim100$ kilo-bytes).
- \vspace{1cm} The project's pipeline (customized Maneage) can be
+ \vspace{6mm} The project's pipeline (customized Maneage) can be
\alert{published} in
\begin{itemize}
\item \alert{arXiv}: uploaded with the \LaTeX{} source to always
stay with the paper \\(for example
- \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}} or \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}).
+ \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}},
+ \textcolor{blue}{\href{https://arxiv.org/abs/1911.01430}{arXiv:1911.01430}},
+ \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}},
+ \textcolor{blue}{\href{https://arxiv.org/abs/2007.11779}{arXiv:2007.11779}}\\
+ \textcolor{blue}{\href{https://arxiv.org/abs/2010.03742}{arXiv:2010.03742}},
+ \textcolor{blue}{\href{https://arxiv.org/abs/2112.14174}{arXiv:2112.14174}}).
\item \alert{Zenodo}: Along with all the input datasets (many
Gigabytes) and software \\(for example
- \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3872248}{zenodo.3872248}}) and given a unique DOI.
+ \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.6533902}{zenodo.6533902}}, also see comments in arXiv links above) and given a unique DOI.
+ \item \alert{Software Heritage}: to archive the full version-controlled history of the project.\\(for example
+ {\small \textcolor{blue}{\href{https://archive.softwareheritage.org/swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f;origin=http://git.maneage.org/paper-concept.git/;visit=swh:1:snp:89af43c4b076a17d9298299f224247038af355ea;anchor=swh:1:rev:313db0b04bd3499f83d9e79fd7e92578cd367c2b}{swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f}}})
\end{itemize}
\end{frame}
@@ -834,7 +1160,20 @@
- \begin{frame}[t]{General outline of using Maneage (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})}
+ \begin{frame}{Software Heritage IDs (SWHID); persistent identifier for source code (or any text!)}
+
+ \vspace{5mm}
+ \includegraphics[width=\linewidth]{img/SWHIDs.png}
+ \vspace{5mm}
+
+ {\hfill\small For more details, see the Software Heritage FAQ (at \textcolor{blue}{\url{https://www.softwareheritage.org/faq}})}
+ \end{frame}
+
+
+
+
+
+ \begin{frame}[t]{Executing a Maneaged project (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})}
\vspace{1cm}
\texttt{\$ git clone https://gitlab.com/makhlaghi/maneage-paper{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\
@@ -850,21 +1189,6 @@
- \ifdefined\longformat
- \begin{frame}{Project source and its execution}
- \begin{tcolorbox}
- Programs \textcolor{gray}{[here: Scientific projects]} must be
- written for \alert{people to read}...
-
- \hfill ...and only \emph{incidentally} for machines to
- \emph{execute}.
-
- \vspace{2mm}
- \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs
- \end{tcolorbox}
- \end{frame}
- \fi
-
\begin{frame}{Future prospects...}
@@ -905,115 +1229,34 @@
- \begin{frame}{Summary:}
- Maneage and its principles are described in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}.
- It is a customizable template that will do the following steps/instructions (all in simple plain text files).
+
+
+
+
+ \begin{frame}{Summary:}
+ Maneage (\textcolor{blue}{\url{https://maneage.org}}) is a customizable template for research or data reduction that does the following:
\begin{itemize}
- \item \alert{Automatically downloads} the necessary
- \emph{software} and \emph{data}.
- \item \alert{Builds} the software in a \alert{closed
- environment}.
- \item Runs the software on data to \alert{generate} the final
- \alert{research results}.
- \item Modification of part of the analysis will only
- result in re-doing that part, not the whole project.
- \item Using LaTeX macros, paper's figures, tables and numbers
- will be \alert{Automatically updated} after a change in
- analysis. Allowing the scientist to focus on the scientific
- interpretation.
- \item The whole project is under \alert{version control} (Git)
- to allow easy reversion to a previous state. This
- \alert{encourages tests/experimentation} in the analysis.
- \item The \alert{Git commit hash} of the project source, is
- \alert{printed} in the published paper and \alert{saved on
- output} data products. Ensuring the
- integrity/reproducibility of the result.
- \item \colorbox{green!30!white}{These slides are available at
- \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.}
- \item \colorbox{green!15!white}{Longer slides are available at
- \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.}
+ \item \alert{Automatically downloads} the necessary \emph{software} and \emph{data}.
+ \item \alert{Builds} the software in a \alert{closed environment}.
+ \item Runs the software on data to \alert{generate} the final \alert{research results}.
+ \item Modification of part of the analysis will only result in re-doing that part, not the whole project.
+ \item Using LaTeX macros, paper's figures, tables and numbers will be \alert{Automatically updated}.
+ \item The whole project is under \alert{version control} (Git) to allow easy reversion to a previous state. This \alert{encourages tests/experimentation} in the analysis.
+ \item The \alert{Git commit hash} of the project source, is \alert{printed} in the published paper and \alert{saved on output} data products. Ensuring the integrity/reproducibility of the result.
+ \item \colorbox{green!30!white}{These slides are available at \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.}
+ \item \colorbox{green!15!white}{Longer slides are available at \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.}
+ \begin{itemize}
+ \item YouTube recording (May 2021): \textcolor{blue}{\url{https://www.youtube.com/watch?v=XdhRUhoMqw0}}
+ \end{itemize}
+ \item \colorbox{purple!20!white}{\small Matrix-protocol chat room: \texttt{\#maneage-general:matrix.org}}
\end{itemize}
\begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
top=1pt, bottom=1pt]
- For a technical description of Maneage's implementation, as well
- as a checklist to customize it, and tips on good practices,
- please see this page:
+ For a technical description of Maneage's implementation, as well as a checklist to customize it, and tips on good practices, please see this page:
\textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}}
\end{tcolorbox}
\end{frame}
-
-
-\ifdefined\longformat
-\begin{frame}{Existing technologies (Independent environment)}
- \begin{itemize}
- \setlength\itemsep{7mm}
- \item \textbf{Virtual machines:}
- \begin{itemize}
- \setlength\itemsep{3mm}
- \item Contain the \alert{full operating system}, are thus very large ($\times$Gigabytes).
- \item In \emph{binary} format (decoding a built VM's environment is extremely hard and inaccurate).
- \end{itemize}
- \item \textbf{Containers:} (For example Docker or Singularity)
- \begin{itemize}
- \setlength\itemsep{3mm}
- \item Similar to virtual machines, but \alert{without low-level kernel} (use host's kernel).
- \item \alert{Will fail} as soon as kernel is no longer supported\\(for example Docker currently only supports Linux kernel 3.10 and above \alert{from 2013}).
- \item Good solutions for software engineers (that need to \emph{reproduce a bug's environment today}).
- \item Docker is modular, needs root previlages (not available in HPCs), Dockerfiles allow incompleteness\\(especially in the common scenario of using the operating system's package manager, see next slide)
- \item Singularity is monolithic and thus can be very large.
- \item In \alert{binary} format (similar to VMs, especially when OS package managers are used).
- \end{itemize}
- \end{itemize}
-
- \vspace{3mm}
-In summary, they only \alert{store a built} environment (they are outputs, not good for archiving).
-\end{frame}
-
-
-
-
-\begin{frame}{Existing technologies (Package managers)}
-
- \begin{itemize}
- \item \textbf{Operating system package managers:}
- \begin{itemize}
- \setlength\itemsep{2mm}
- \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software).
- \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible).
- \item Older software (for example +5 years) is usually removed.
- \end{itemize}
- \item \textbf{Conda/Anaconda:}
- \begin{itemize}
- \setlength\itemsep{2mm}
- \item Conda has build instructions for software and their dependencies.
- \item But it doesn't go down to the C library or the lower-level components of operating system.
- \item It is written in Python (can't be used later when current Python is depreciated).
- \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility).
- \end{itemize}
- \item \textbf{Nix, or GNU Guix:}
- \begin{itemize}
- \setlength\itemsep{2mm}
- \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access.
- \item Doesn't \emph{require} documentation of dependencies.
- \end{itemize}
- \item \textbf{Spack:} Similar to Nix/Guix but written in Python.
- \end{itemize}
-\end{frame}
-
-\begin{frame}{Existing technologies (workflow tools)}
- \begin{itemize}
- \setlength\itemsep{4mm}
- \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda.
- \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI make it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding.
- \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them.
- \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker.
- \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker.
- \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software.
- \end{itemize}
- \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}.
-\end{frame}
-\fi
\end{document}