From 65c690fac9d1c7675fa06052ce96d34c2a39af3b Mon Sep 17 00:00:00 2001 From: Mohammad Akhlaghi Date: Thu, 5 Mar 2020 14:51:07 +0000 Subject: RDA Exposing Data Management Plans Working group --- reproducible-paper.tex | 182 ++++++++++++++++++++----------------------------- 1 file changed, 75 insertions(+), 107 deletions(-) diff --git a/reproducible-paper.tex b/reproducible-paper.tex index 4b8003c..a8209c1 100644 --- a/reproducible-paper.tex +++ b/reproducible-paper.tex @@ -46,7 +46,7 @@ %% Set the date and insitutional logos. \date{\footnotesize\vspace{0cm}\\ - \href{http://iactalks.iac.es/talks/view/1386}{Research Division Seminar}, IACTalks\\ \href{https://iac.es}{Instituto de Astrof\'isica de Canarias} (IAC),\\February 20th, 2020 \\ + \href{https://www.rd-alliance.org/groups/exposing-data-management-plans-wg}{RDA Exposing Data Management Plans Working group}\\March 5th, 2020 \\ \tiny\vspace{3mm} Most recent slides available in link below (this PDF is built from \href{https://gitlab.com/makhlaghi/reproducible-paper-slides}{Git commit} \gitcommit):\\ \footnotesize\textcolor{blue}{\url{http://akhlaghi.org/pdf/reproducible-paper.pdf}}\\ @@ -1270,112 +1270,6 @@ - \begin{frame} - \vspace{1cm} - \hfill\Large Usage in real-world instrument pipelines... 
- \end{frame} - - \begin{frame}{AstroCat (Tarragona, Spain), pipeline written by Ra\'ul Infante-Sainz, data from Aleix Roig} - \begin{columns} - \column{0.5\linewidth} - \includegraphics[width=\linewidth]{img/astrocat-telescope.jpg} - \column{0.5\linewidth} - \includegraphics[width=\linewidth]{img/astrocat-m101.jpg} - \end{columns} - \end{frame} - - \begin{frame}{IAC 80 (Teide Observatory), Ra\'ul Infante-Sainz \& Alberto Madrigal (Master student)} - \begin{columns} - \column{0.4\linewidth} - \includegraphics[width=\linewidth]{img/iac80.jpg} - \column{0.6\linewidth} - \includegraphics[width=\linewidth]{img/iac80-df4.jpg} - \end{columns} - \end{frame} - - \begin{frame}{HiPERCAM (GTC), Ra\'ul Infante-Sainz and Giulia Golini (Master Student)} - \begin{columns} - \column{0.4\linewidth} - \centering - \includegraphics[width=0.8\linewidth]{img/gtc.jpg} - \includegraphics[width=0.8\linewidth]{img/gtc-hipercam.jpg} - \column{0.6\linewidth} - \includegraphics[width=\linewidth]{img/gtc-hipercam-udg.jpg} - \end{columns} - \end{frame} - - \begin{frame}{SDSS PSF, Ra\'ul Infante-Sainz et al. 
2020} - \begin{columns} - \column{0.4\linewidth} - \centering - \includegraphics[width=0.8\linewidth]{img/sdss.jpg} - \includegraphics[width=0.8\linewidth]{img/sdss-ccds.jpg} - \column{0.6\linewidth} - %% http://www.imc-srl.com/work-in-progress/ - \includegraphics[width=\linewidth]{img/sdss-psf.jpg} - \end{columns} - \end{frame} - - \begin{frame}{Remember Ra\'ul's paper that was shown before?} - \centering - \includegraphics[width=0.7\linewidth]{img/firstpage-mnras491-highlighted.png} - \end{frame} - - \begin{frame}{Subaru Telescope Hyper SuprimeCam PSF, Roberto Baena Gall\'e} - \begin{columns} - \column{0.4\linewidth} - \centering - \includegraphics[width=0.8\linewidth]{img/subaru.jpg} - \includegraphics[width=0.8\linewidth]{img/subaru-hsc.jpg} - \column{0.6\linewidth} - \includegraphics[width=\linewidth]{img/subaru-hsc-psf.jpg} - \end{columns} - \end{frame} - - \begin{frame}{OSIRIS (GTC), Ra\'ul Castellanos (Madrid), guided by Ra\'ul Infante-Sainz} - \begin{columns} - \column{0.4\linewidth} - \centering - \includegraphics[width=0.8\linewidth]{img/gtc.jpg} - \includegraphics[width=0.8\linewidth]{img/gtc-osiris.jpg} - \column{0.6\linewidth} - %% http://www.imc-srl.com/work-in-progress/ - \centering - \includegraphics[width=0.6\linewidth]{img/gtc-osiris-abel2390.jpg}\\ - \includegraphics[width=0.3\linewidth]{img/work-in-progress.jpg} - \end{columns} - \end{frame} - - \begin{frame}{Iranian National Obs. 
Lens Array: Zahra Sharbaf, Hamed Altafi, Elham Saremi, Surena Fatemi} - \begin{columns} - \column{0.4\linewidth} - \centering - \includegraphics[width=0.8\linewidth]{img/inola.jpg} - \includegraphics[width=0.8\linewidth]{img/inola-team.jpg} - \column{0.6\linewidth} - \centering - %% http://www.imc-srl.com/work-in-progress/ - \includegraphics[width=0.7\linewidth]{img/inola-ngc6946.jpg}\\ - \includegraphics[width=0.3\linewidth]{img/work-in-progress.jpg} - \end{columns} - \end{frame} - - \begin{frame}{Subaru Telescope Hyper SuprimeCam for the Low Surface Brightness, Mohammad Akhlaghi} - \begin{columns} - \column{0.4\linewidth} - \centering - \includegraphics[width=0.8\linewidth]{img/subaru.jpg} - \includegraphics[width=0.8\linewidth]{img/subaru-hsc.jpg} - \column{0.6\linewidth} - \centering - \includegraphics[width=0.7\linewidth]{img/subaru-hsc-dr2-star.png}\\ - \tiny{Image from HSC DR2, showing the problem of over-subtraction!}\\ - \includegraphics[width=0.3\linewidth]{img/work-in-progress.jpg} - \end{columns} - \end{frame} - - - \begin{frame}{Future prospects...} \large Adoption of reproducibility by many researchers will enable @@ -1470,6 +1364,80 @@ +\begin{frame}{Existing technologies (Independent environment)} + \begin{itemize} + \setlength\itemsep{7mm} + \item \textbf{Virtual machines:} + \begin{itemize} + \setlength\itemsep{3mm} + \item Contain the \alert{full operating system}, are thus very large ($\times$Gigabytes). + \item In \emph{binary} format (decoding a built VM's environment is extremely hard and inaccurate). + \end{itemize} + \item \textbf{Containers:} (For example Docker or Singularity) + \begin{itemize} + \setlength\itemsep{3mm} + \item Similar to virtual machines, but \alert{without low-level kernel} (use host's kernel). + \item \alert{Will fail} as soon as kernel is no longer supported\\(for example Docker currently only supports Linux kernel 3.10 and above \alert{from 2013}). 
+ \item Good solutions for software engineers (that need to \emph{reproduce a bug's environment today}). + \item Docker is modular, needs root privileges (not available in HPCs), Dockerfiles allow incompleteness\\(especially in the common scenario of using the operating system's package manager, see next slide) + \item Singularity is monolithic and thus can be very large. + \item In \alert{binary} format (similar to VMs, especially when OS package managers are used). + \end{itemize} + \end{itemize} + + \vspace{3mm} +In summary, they only \alert{store a built} environment (they are outputs, not good for archiving). + +\end{frame} + + + + + +\begin{frame}{Existing technologies (Package managers)} + + \begin{itemize} + \item \textbf{Operating system package managers:} + \begin{itemize} + \setlength\itemsep{2mm} + \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software). + \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible). + \item Older software (for example +5 years) is usually removed. + \end{itemize} + \item \textbf{Conda/Anaconda:} + \begin{itemize} + \setlength\itemsep{2mm} + \item Conda has build instructions for software and their dependencies. + \item But it doesn't go down to the C library or the lower-level components of operating system. + \item It is written in Python (can't be used later when current Python is deprecated). + \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility). 
+ \end{itemize} + \item \textbf{Nix, or GNU Guix:} + \begin{itemize} + \setlength\itemsep{2mm} + \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access. + \item Doesn't \emph{require} documentation of dependencies. + \end{itemize} + \item \textbf{Spack:} Similar to Nix/Guix but written in Python. + \end{itemize} +\end{frame} + +\begin{frame}{Existing technologies (workflow tools)} + \begin{itemize} + \setlength\itemsep{4mm} + \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda. + \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI makes it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding. + \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them. + \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker. + \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker. + \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software. + \end{itemize} + \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. +\end{frame} + + + + \begin{frame}{Summary:} A fully working template/framework is introduced that will do the -- cgit v1.2.1