From 5a9de3b5a3b263f1042148b84f8e9adedc652396 Mon Sep 17 00:00:00 2001 From: Mohammad Akhlaghi Date: Thu, 2 Dec 2021 05:00:33 +0100 Subject: Updated slides for talk at Paris Observatory --- img/cpu-arch-astropy.png | Bin 506421 -> 240440 bytes img/cpu-arch-gnuastro.png | Bin 460828 -> 220961 bytes img/distros-astropy.pdf | Bin 40387 -> 52228 bytes img/distros-gnuastro.pdf | Bin 61510 -> 75718 bytes img/maneage-paper.png | Bin 0 -> 322232 bytes img/maneage-webpage.png | Bin 0 -> 111125 bytes slides-intro.tex | 273 ++++++++++++++++++++++++++-------------------- 7 files changed, 152 insertions(+), 121 deletions(-) create mode 100644 img/maneage-paper.png create mode 100644 img/maneage-webpage.png diff --git a/img/cpu-arch-astropy.png b/img/cpu-arch-astropy.png index 28f26fb..d13e4df 100644 Binary files a/img/cpu-arch-astropy.png and b/img/cpu-arch-astropy.png differ diff --git a/img/cpu-arch-gnuastro.png b/img/cpu-arch-gnuastro.png index f222380..adbdd2f 100644 Binary files a/img/cpu-arch-gnuastro.png and b/img/cpu-arch-gnuastro.png differ diff --git a/img/distros-astropy.pdf b/img/distros-astropy.pdf index 3cbea3f..db82659 100644 Binary files a/img/distros-astropy.pdf and b/img/distros-astropy.pdf differ diff --git a/img/distros-gnuastro.pdf b/img/distros-gnuastro.pdf index 2ada969..0c083fd 100644 Binary files a/img/distros-gnuastro.pdf and b/img/distros-gnuastro.pdf differ diff --git a/img/maneage-paper.png b/img/maneage-paper.png new file mode 100644 index 0000000..266133e Binary files /dev/null and b/img/maneage-paper.png differ diff --git a/img/maneage-webpage.png b/img/maneage-webpage.png new file mode 100644 index 0000000..a262fd4 Binary files /dev/null and b/img/maneage-webpage.png differ diff --git a/slides-intro.tex b/slides-intro.tex index 8a22ccb..35d71cc 100644 --- a/slides-intro.tex +++ b/slides-intro.tex @@ -35,7 +35,7 @@ %% Set the title \title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility - \\\vspace{2mm} \large Maneage: \emph{Man}aging data 
lin\emph{eage} for long-term and archivable reproducibility \\\vspace{1mm} \footnotesize (\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}, \href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860})} + \\\vspace{2mm} \large Maneage: \emph{Man}aging data lin\emph{eage} for long-term and archivable reproducibility \\\vspace{1mm} \footnotesize (Published in CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}})} %% Set the author \author{\\ @@ -46,8 +46,8 @@ %% Set the date and insitutional logos. \date{\footnotesize\vspace{-5mm}\\ - \textcolor{black}{SoftwareHeritage 5th Anniversary}\\ - \textcolor{black}{November 30th, 2021 (Inria, Paris)} \\ + \textcolor{black}{S\'eminaires LERMA}\\ + \textcolor{black}{December 2nd, 2021 (Paris Observatory)} \\ \tiny\vspace{9mm} Most recent slides available in link below (this PDF is built from \href{http://git.maneage.org/slides-intro.git}{Git commit} \gitcommit):\\ \footnotesize\textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}\\ @@ -90,9 +90,6 @@ \end{frame} - - - \begin{frame}{Let's start with this nice image of the Wirlpool galaxy (M51): \small{\url{https://i.redd.it/jfqgpqg0hfk11.jpg}}} \begin{center} \includegraphics[width=0.8\linewidth]{img/m51-amateur.jpg} @@ -239,6 +236,62 @@ for computational reproducibility] + \begin{frame}{Our solution: CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}} + \begin{columns} + \column{0.4\linewidth} + \includegraphics[width=\linewidth]{img/maneage-paper.png} + \column{0.6\linewidth} + \includegraphics[width=\linewidth]{img/maneage-webpage.png} + \begin{center} + \huge{https://maneage.org} + \end{center} + \end{columns} + \end{frame} + + + + 
\begin{frame}{Recognition 1: RDA adoption grant (2019) to IAC for Maneage} + \begin{center} + \includegraphics[width=3cm]{img/rda.png}\hspace{1cm} + \includegraphics[width=1.8cm]{img/iac.png} + \includegraphics[width=\linewidth]{img/h2020.jpg} + \end{center} + + \vspace{1cm} For Maneage, the \alert{IAC} is selected as + a \alert{Top European organization} funded to adopt RDA + Recommendations and Outputs. + + \vspace{1cm} + \scriptsize + \begin{itemize} + \item Research Data Alliance was launched by the \alert{European + Commission}, NSF, National Institute of Standards and + Technology, and the Australian Government’s Department of + Innovation. + \item RDA Outputs are the technical and social infrastructure + solutions developed by RDA Working Groups or Interest + Groups that enable data sharing, exchange, and + interoperability. + \end{itemize} + + \vspace{0.2cm} + \end{frame} + + + + + + \begin{frame}{Recognition 2: ``News and Views'' in Nature Astronomy (\textcolor{blue}{\href{https://doi.org/10.1038/s41550-021-01402-3}{DOI:10.1038/s41550-021-01402-3}})} + \begin{center} + \includegraphics[width=0.8\linewidth]{img/nature-astronomy.png} + \end{center} + + \vspace{-2mm} + \footnotesize Free-to-read link: \textcolor{blue}{\url{https://rdcu.be/cmYVx}} + \end{frame} + + + \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}} @@ -355,14 +408,15 @@ for computational reproducibility] \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \newcommand{\sver}{} \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} - \begin{frame}{Different package managers have different versions of software (repology.org, 2019/11/20)} + \begin{frame}[t]{Different package managers have different versions of software (repology.org, 2021/12/02)} \begin{columns} 
\column{7cm} \center Astropy\\ - \includegraphics[width=2.2cm]{img/distros-astropy.pdf} + \includegraphics[width=2.8cm]{img/distros-astropy.pdf} \column{7cm} \center GNU Astronomy Utilities (Gnuastro)\\ - \includegraphics[width=2.7cm]{img/distros-gnuastro.pdf} + \includegraphics[trim={0 8cm 0 0}, clip, width=2.9cm]{img/distros-gnuastro.pdf} + \includegraphics[trim={0 0 0 16cm}, clip, width=2.9cm]{img/distros-gnuastro.pdf} \end{columns} \end{frame} \newcommand{\srep}{} @@ -387,13 +441,13 @@ for computational reproducibility] al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}). } \end{frame} - \begin{frame}{Impact of ``Dependency hell'' on native building in various hardware (CPU architectures)} + \begin{frame}{Impact of ``Dependency hell'' on native building in various hardware (CPU architectures), retrieved from Debian on 2021/12/02} \begin{columns} \column{7cm} - \includegraphics[width=0.9\linewidth]{img/cpu-arch-astropy.png} + \includegraphics[width=\linewidth]{img/cpu-arch-astropy.png} Astropy depends on Matplotlib \column{6cm} - \includegraphics[width=0.9\linewidth]{img/cpu-arch-gnuastro.png} + \includegraphics[width=1.05\linewidth]{img/cpu-arch-gnuastro.png} GNU Astronomy Utilities doesn't. \end{columns} \end{frame} @@ -1237,6 +1291,7 @@ for computational reproducibility] + \begin{frame}{Publication of the project} A reproducible project using Maneage will have the following @@ -1251,19 +1306,25 @@ for computational reproducibility] \alert{negligible} compared to a single figure in a paper (usually $\sim100$ kilo-bytes). + \pause + \vspace{7mm} The project's pipeline (customized Maneage) can be \alert{published} in \begin{itemize} \item \alert{arXiv}: uploaded with the \LaTeX{} source to always stay with the paper \\(for example - \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). 
The - file containing all macros must also be uploaded so arXiv's - server can easily build the \LaTeX{} source. - \item \alert{Software Heritage} which is a long-term archival repository for source code, providing permanent links to cite any part of the code. - For example see the \textcolor{blue}{\href{https://archive.softwareheritage.org/browse/origin/directory/?origin_url=https://gitlab.com/makhlaghi/maneage-paper.git}{Maneage paper's source there}}. + \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}} or \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}). \item \alert{Zenodo}: Along with all the input datasets (many Gigabytes) and software \\(for example - \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}}) and given a unique DOI. + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3872247}{zenodo.3872247}}) and given a unique DOI. + \begin{itemize} + \item ... and put links to data in paper! See ending of caption of Figure 1 in the \textcolor{blue}{\href{https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9403875}{Maneage paper}}. + \end{itemize} + \item \alert{Software Heritage}: to archive the full version-controlled history of the project.\\(for example + {\small \textcolor{blue}{\href{https://archive.softwareheritage.org/swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f;origin=http://git.maneage.org/paper-concept.git/;visit=swh:1:snp:89af43c4b076a17d9298299f224247038af355ea;anchor=swh:1:rev:313db0b04bd3499f83d9e79fd7e92578cd367c2b}{swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f}}}) + \begin{itemize} + \item ... and put links to exact parts of the code! See caption of Listing 1 in the \textcolor{blue}{\href{https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9403875}{Maneage paper}}. 
+ \end{itemize} \end{itemize} \end{frame} @@ -1344,32 +1405,44 @@ for computational reproducibility] - \begin{frame}{RDA adoption grant (2019) to IAC for Maneage} - \begin{center} - \includegraphics[width=3cm]{img/rda.png}\hspace{1cm} - \includegraphics[width=1.8cm]{img/iac.png} - \includegraphics[width=\linewidth]{img/h2020.jpg} - \end{center} - \vspace{1cm} For Maneage, the \alert{IAC} is selected as - a \alert{Top European organization} funded to adopt RDA - Recommendations and Outputs. + \begin{frame}{Summary:} - \vspace{1cm} - \scriptsize - \begin{itemize} - \item Research Data Alliance was launched by the \alert{European - Commission}, NSF, National Institute of Standards and - Technology, and the Australian Government’s Department of - Innovation. - \item RDA Outputs are the technical and social infrastructure - solutions developed by RDA Working Groups or Interest - Groups that enable data sharing, exchange, and - interoperability. - \end{itemize} + Maneage is introduced as a customizable template that will do the + following steps/instructions (all in simple plain text files). + \begin{itemize} + \item \alert{Automatically downloads} the necessary + \emph{software} and \emph{data}. + \item \alert{Builds} the software in a \alert{closed + environment}. + \item Runs the software on data to \alert{generate} the final + \alert{research results}. + \item Only parts affected by a modification are re-done. + \item Using LaTeX macros, paper's figures, tables and numbers + will be \alert{Automatically updated}. + \item The whole project is under \alert{version control} (Git) + \alert{encouraging tests/experimentation}. + \item The \alert{Git commit hash} of the project source, is + \alert{printed} in the paper and \alert{on output} data + products. 
+ \item \colorbox{green!30!white}{These slides are available at + \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} + \end{itemize} - \vspace{0.2cm} + \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, + top=1pt, bottom=1pt] + For a technical description of Maneage's implementation, as well + as a checklist to customize it, and tips on good practices, + please see this page: + + \textcolor{blue}{\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} + + \vspace{3mm} + \hfill \colorbox{green!30!white}{Feel free to contact me: \textcolor{blue}{\large{\url{mohammad@akhlaghi.org}}}} + \end{tcolorbox} \end{frame} +\end{document} + @@ -1405,91 +1478,49 @@ In summary, they only \alert{store a built} environment (they are outputs, not g -\begin{frame}{Existing technologies (Package managers)} - - \begin{itemize} - \item \textbf{Operating system package managers:} - \begin{itemize} - \setlength\itemsep{2mm} - \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software). - \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible). - \item Older software (for example +5 years) is usually removed. - \end{itemize} - \item \textbf{Conda/Anaconda:} - \begin{itemize} - \setlength\itemsep{2mm} - \item Conda has build instructions for software and their dependencies. - \item But it doesn't go down to the C library or the lower-level components of operating system. - \item It is written in Python (can't be used later when current Python is depreciated). - \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility). 
- \end{itemize} - \item \textbf{Nix, or GNU Guix:} - \begin{itemize} - \setlength\itemsep{2mm} - \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access. - \item Doesn't \emph{require} documentation of dependencies. - \end{itemize} - \item \textbf{Spack:} Similar to Nix/Guix but written in Python. - \end{itemize} -\end{frame} - -\begin{frame}{Existing technologies (workflow tools)} - \begin{itemize} - \setlength\itemsep{4mm} - \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda. - \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI make it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding. - \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them. - \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker. - \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker. - \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software. - \end{itemize} - \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. +%\begin{frame}{Existing technologies (Package managers)} +% +% \begin{itemize} +% \item \textbf{Operating system package managers:} +% \begin{itemize} +% \setlength\itemsep{2mm} +% \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software). +% \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible). 
+% \item Older software (for example +5 years) is usually removed. +% \end{itemize} +% \item \textbf{Conda/Anaconda:} +% \begin{itemize} +% \setlength\itemsep{2mm} +% \item Conda has build instructions for software and their dependencies. +% \item But it doesn't go down to the C library or the lower-level components of operating system. +% \item It is written in Python (can't be used later when current Python is depreciated). +% \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility). +% \end{itemize} +% \item \textbf{Nix, or GNU Guix:} +% \begin{itemize} +% \setlength\itemsep{2mm} +% \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access. +% \item Doesn't \emph{require} documentation of dependencies. +% \end{itemize} +% \item \textbf{Spack:} Similar to Nix/Guix but written in Python. +% \end{itemize} +%\end{frame} +% +%\begin{frame}{Existing technologies (workflow tools)} +% \begin{itemize} +% \setlength\itemsep{4mm} +% \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda. +% \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI make it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding. +% \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them. +% \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker. +% \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker. 
+% \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software. +% \end{itemize} +% \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. \end{frame} - - \begin{frame}{Summary:} - - Maneage is introduced as a customizable template that will do the - following steps/instructions (all in simple plain text files). - \begin{itemize} - \item \alert{Automatically downloads} the necessary - \emph{software} and \emph{data}. - \item \alert{Builds} the software in a \alert{closed - environment}. - \item Runs the software on data to \alert{generate} the final - \alert{research results}. - \item Only parts affected by a modifcation are re-done. - \item Using LaTeX macros, paper's figures, tables and numbers - will be \alert{Automatically updated}. - \item The whole project is under \alert{version control} (Git) - \alert{encouraging tests/experimentation}. - \item The \alert{Git commit hash} of the project source, is - \alert{printed} in the paper and \alert{on output} data - products. - \item \colorbox{green!30!white}{These slides are available at - \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} - \end{itemize} - - \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, - top=1pt, bottom=1pt] - For a technical description of Maneage's implementation, as well - as a checklist to customize it, and tips on good practices, - please see this page: - - \textcolor{blue}{\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} - - \vspace{3mm} - \hfill \colorbox{green!30!white}{Feel free to contact me: \textcolor{blue}{\large{\url{mohammad@akhlaghi.org}}}} - \end{tcolorbox} - \end{frame} -\end{document} - - - - - % \begin{frame}{Funding to help adoption of template} % \begin{itemize} % \setlength\itemsep{5mm} -- cgit v1.2.1