diff options
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | img/SWHIDs.png | bin | 0 -> 178136 bytes | |||
-rw-r--r-- | img/aragon.png | bin | 0 -> 15788 bytes | |||
-rw-r--r-- | img/binary-blue.jpg | bin | 0 -> 258009 bytes | |||
-rw-r--r-- | img/cefca.png | bin | 0 -> 13462 bytes | |||
-rw-r--r-- | img/cpu-arch-astropy.png | bin | 506421 -> 240440 bytes | |||
-rw-r--r-- | img/cpu-arch-gnuastro.png | bin | 460828 -> 220961 bytes | |||
-rw-r--r-- | img/dependencies-conda-initial.png | bin | 0 -> 128310 bytes | |||
-rw-r--r-- | img/dependencies-jupyter.png | bin | 0 -> 188646 bytes | |||
-rw-r--r-- | img/digital-tunnel.jpg | bin | 107084 -> 0 bytes | |||
-rw-r--r-- | img/distros-astropy.pdf | bin | 40387 -> 52228 bytes | |||
-rw-r--r-- | img/distros-gnuastro.pdf | bin | 61510 -> 75718 bytes | |||
-rw-r--r-- | img/figure-branching.pdf | bin | 0 -> 158623 bytes | |||
-rw-r--r-- | img/filters-lsst.png | bin | 0 -> 41813 bytes | |||
-rw-r--r-- | img/jcava.jpg | bin | 0 -> 42107 bytes | |||
-rw-r--r-- | img/maneage-paper.png | bin | 0 -> 322232 bytes | |||
-rw-r--r-- | img/maneage-webpage.png | bin | 0 -> 111125 bytes | |||
-rw-r--r-- | img/minijpas-web.png | bin | 0 -> 1786685 bytes | |||
-rw-r--r-- | img/nature-astronomy.png | bin | 0 -> 36572 bytes | |||
-rw-r--r-- | img/nature-cartoon.jpg | bin | 118427 -> 0 bytes | |||
-rw-r--r-- | img/oaj.jpg | bin | 0 -> 43544 bytes | |||
-rw-r--r-- | img/project-flow-small.png | bin | 0 -> 25337 bytes | |||
-rw-r--r-- | img/reproducibility-crisis.jpg | bin | 624574 -> 0 bytes | |||
-rw-r--r-- | slides-intro-short.tex | 647 | ||||
-rw-r--r-- | slides-intro.tex | 389 | ||||
-rw-r--r-- | tex/git-branch.tex | 33 |
26 files changed, 687 insertions, 384 deletions
@@ -29,7 +29,7 @@ clean: clean-latex # PDF slides: -slides=slides-intro.pdf slides-intro-short.pdf +slides = slides-intro-short.pdf slides-intro.pdf $(slides): %.pdf: %.tex tex/*.tex # We'll run pdflatex two times so the page numbers and # internal links also work. diff --git a/img/SWHIDs.png b/img/SWHIDs.png Binary files differnew file mode 100644 index 0000000..7f1a705 --- /dev/null +++ b/img/SWHIDs.png diff --git a/img/aragon.png b/img/aragon.png Binary files differnew file mode 100644 index 0000000..26ebdc9 --- /dev/null +++ b/img/aragon.png diff --git a/img/binary-blue.jpg b/img/binary-blue.jpg Binary files differnew file mode 100644 index 0000000..3901e79 --- /dev/null +++ b/img/binary-blue.jpg diff --git a/img/cefca.png b/img/cefca.png Binary files differnew file mode 100644 index 0000000..a592fef --- /dev/null +++ b/img/cefca.png diff --git a/img/cpu-arch-astropy.png b/img/cpu-arch-astropy.png Binary files differindex 28f26fb..d13e4df 100644 --- a/img/cpu-arch-astropy.png +++ b/img/cpu-arch-astropy.png diff --git a/img/cpu-arch-gnuastro.png b/img/cpu-arch-gnuastro.png Binary files differindex f222380..adbdd2f 100644 --- a/img/cpu-arch-gnuastro.png +++ b/img/cpu-arch-gnuastro.png diff --git a/img/dependencies-conda-initial.png b/img/dependencies-conda-initial.png Binary files differnew file mode 100644 index 0000000..739260b --- /dev/null +++ b/img/dependencies-conda-initial.png diff --git a/img/dependencies-jupyter.png b/img/dependencies-jupyter.png Binary files differnew file mode 100644 index 0000000..19a4f9c --- /dev/null +++ b/img/dependencies-jupyter.png diff --git a/img/digital-tunnel.jpg b/img/digital-tunnel.jpg Binary files differdeleted file mode 100644 index c316553..0000000 --- a/img/digital-tunnel.jpg +++ /dev/null diff --git a/img/distros-astropy.pdf b/img/distros-astropy.pdf Binary files differindex 3cbea3f..db82659 100644 --- a/img/distros-astropy.pdf +++ b/img/distros-astropy.pdf diff --git a/img/distros-gnuastro.pdf 
b/img/distros-gnuastro.pdf Binary files differindex 2ada969..0c083fd 100644 --- a/img/distros-gnuastro.pdf +++ b/img/distros-gnuastro.pdf diff --git a/img/figure-branching.pdf b/img/figure-branching.pdf Binary files differnew file mode 100644 index 0000000..21d6450 --- /dev/null +++ b/img/figure-branching.pdf diff --git a/img/filters-lsst.png b/img/filters-lsst.png Binary files differnew file mode 100644 index 0000000..0376c7f --- /dev/null +++ b/img/filters-lsst.png diff --git a/img/jcava.jpg b/img/jcava.jpg Binary files differnew file mode 100644 index 0000000..fc4f215 --- /dev/null +++ b/img/jcava.jpg diff --git a/img/maneage-paper.png b/img/maneage-paper.png Binary files differnew file mode 100644 index 0000000..266133e --- /dev/null +++ b/img/maneage-paper.png diff --git a/img/maneage-webpage.png b/img/maneage-webpage.png Binary files differnew file mode 100644 index 0000000..a262fd4 --- /dev/null +++ b/img/maneage-webpage.png diff --git a/img/minijpas-web.png b/img/minijpas-web.png Binary files differnew file mode 100644 index 0000000..b391240 --- /dev/null +++ b/img/minijpas-web.png diff --git a/img/nature-astronomy.png b/img/nature-astronomy.png Binary files differnew file mode 100644 index 0000000..548afdf --- /dev/null +++ b/img/nature-astronomy.png diff --git a/img/nature-cartoon.jpg b/img/nature-cartoon.jpg Binary files differdeleted file mode 100644 index c4ab023..0000000 --- a/img/nature-cartoon.jpg +++ /dev/null diff --git a/img/oaj.jpg b/img/oaj.jpg Binary files differnew file mode 100644 index 0000000..a065eb4 --- /dev/null +++ b/img/oaj.jpg diff --git a/img/project-flow-small.png b/img/project-flow-small.png Binary files differnew file mode 100644 index 0000000..010a491 --- /dev/null +++ b/img/project-flow-small.png diff --git a/img/reproducibility-crisis.jpg b/img/reproducibility-crisis.jpg Binary files differdeleted file mode 100644 index 362576a..0000000 --- a/img/reproducibility-crisis.jpg +++ /dev/null diff --git a/slides-intro-short.tex 
b/slides-intro-short.tex index 3bc3f8f..3e0f8ff 100644 --- a/slides-intro-short.tex +++ b/slides-intro-short.tex @@ -1,6 +1,6 @@ % LaTeX source of slides on reproducible paper. % -% Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org> +% Copyright (C) 2020-2022 Mohammad Akhlaghi <mohammad@akhlaghi.org> % % This LaTeX source is free software: you can redistribute it and/or % modify it under the terms of the GNU General Public License as @@ -37,34 +37,32 @@ (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}} %% Set the title -\title{Introducing Maneage:\\ - Customizable framework for managing data lineage\\ - \vspace{2mm}{\small [RDA Europe Adoption grant recipient. Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE} (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}), Comments welcome]} -} +\title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility + \\\vspace{2mm} \large Maneage: Managing data lineage for long-term and archivable reproducibility \\\vspace{1mm} \footnotesize (Published in CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}})} %% Set the author -\author{\vspace{8mm}\\ +\author{\\ \href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm} \footnotesize - Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain + Centro de Estudios de F\'isica del Cosmos de Arag\'on ({\scriptsize CEFCA}), Teruel, Spain\vspace{8mm} } %% Set the date and insitutional logos. 
-\date{\footnotesize\vspace{0cm}\\ - \href{https://www.rd-alliance.org/rda-global-adoption-week-15-19-june-2020}{RDA Global Adoption week}\\June 18th, 2020\\ - \tiny\vspace{3mm} +\date{\footnotesize\vspace{-9mm}\\ + \textcolor{black}{Royal Observatory Coffee talk; Edinburgh}\\ + \textcolor{black}{23rd of May 2023} \\ + \tiny\vspace{10mm} Most recent slides available in link below (this PDF is built from \href{http://git.maneage.org/slides-intro.git}{Git commit} \gitcommit):\\ \footnotesize\textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}\\ - \vspace{2mm}\hspace{-0.25cm} - \raisebox{+0.4\height}{\includegraphics[width=2.5cm]{img/ministerio-ciencia.png}} - \raisebox{+0.3\height}{\includegraphics[width=1.3cm]{img/sundial.png}} + \vspace{2mm} + \raisebox{+0.2\height}{\includegraphics[width=3.5cm]{img/jcava.jpg}} + \includegraphics[width=9.5mm]{img/cefca.png} \includegraphics[width=1.2cm]{img/iac.png} - \includegraphics[width=1cm]{img/eu-sundial.png} \raisebox{0.13\height}{\includegraphics[width=1cm]{img/eu-regional.png}} \raisebox{0.05\height}{\includegraphics[width=1cm]{img/eu-rdaeu4.png}} \raisebox{+0.1\height}{\includegraphics[width=1.4cm]{img/rda-europe.png}} - \raisebox{+1.3\height}{\includegraphics[width=1.4cm]{img/ull.png}} - { }\raisebox{+0.5\height}{\includegraphics[width=2cm]{img/gobierno-canarias.png}}\\ + \raisebox{+1\height}{\includegraphics[width=1.5cm]{img/aragon.png}} + \raisebox{+0.8\height}{\includegraphics[width=1.6cm]{img/gobierno-canarias.png}}\\ \vspace{1cm} } @@ -86,7 +84,6 @@ - \begin{document} \begin{frame} @@ -95,34 +92,42 @@ \usebackgroundtemplate{ } %% undeclare it - \begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}} - Challenges (also relevant to researchers, not just repositories) - \begin{itemize} - \item \emph{Bi-directional linking}: how to \alert{link data and publications}. 
- \item \emph{\alert{Software management}:} how to manage, preserve, publish and cite software? - \item \emph{Metrics:} \alert{how often} are data used. - \item \emph{Incentives to researchers:} how to \alert{communicate benefits} of following good practices \alert{to researchers}. - \end{itemize} + + + + %% Introduction to OAJ and J-PAS + \begin{frame}{Our main project: \textbf{J-PAS} with Observatorio Astrof\'isico de Javalambre (OAJ)} + J-PAS will observe the northern sky in \alert{56 medium-band filters} ($\sim14$nm): \begin{center} - \includegraphics[width=4cm]{img/rda.png}\hspace{1cm} - \includegraphics[width=4cm]{img/wds.jpg} + \includegraphics[width=0.9\linewidth]{img/oaj.jpg} \end{center} + \end{frame} - \ifdefined\longformat\pause\fi + \begin{frame}{LSST filter: 6 {\footnotesize(image from \href{https://speclite.readthedocs.io/en/latest/filters.html}{speclite docs})}:} + \begin{center} + \vspace{-3mm} + \includegraphics[width=0.8\linewidth]{img/filters-lsst.png} + \end{center} + \end{frame} - ``\emph{We would like to see a workflow that results in all - \textcolor{blue!30!green}{\bf scholarly objects being connected}, - linked, citable, and persistent to allow researchers to navigate - smoothly and to \alert{\bf enable reproducible research}. This - includes \alert{{\bf linkages} between documentation, code, data, and - journal articles in an integrated environment}. 
Furthermore, - in the ideal workflow, all of these objects need to be - \alert{\bf well documented} to enable other researchers (or - citizen scientists etc) to reuse the data for new - discoveries.}'' + \begin{frame}{J-PAS filters: 56 (Bonoli+2021: \href{https://ui.adsabs.harvard.edu/abs/2021A\%26A...653A..31B}{2021A\&A...653A..31B})} + \begin{center} + \includegraphics[width=\linewidth]{img/filters-jpas.pdf} + \end{center} \end{frame} + \begin{frame}{Result: photo-\alert{spectra} of \alert{every pixel} of the non-Galactic northern sky (like an IFU)!} + \url{http://archive.cefca.es/catalogues/minijpas-pdr201912/navigator.html} + + \vspace{3mm} + \includegraphics[width=\linewidth]{img/minijpas-web.png} + \end{frame} + + + + + \newcommand{\allopacity}{1} \ifdefined\longformat \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} @@ -165,23 +170,16 @@ \begin{frame}{Science is a tricky business} - \begin{center} - \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg} - \end{center} - - \vspace{-0.3cm}\hfill - {\tiny Image from nature.com - (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five - ways to fix statistics}'', Nov 2017)}\hspace{7mm} - \vspace{-1mm} \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm] - \small Data analysis [...] is a \alert{human + \large Data analysis [...] is a \alert{human behavior}. Researchers who hunt hard enough will turn up a result that fits statistical criteria, but their \alert{discovery} will probably be a \alert{false positive}. - \hfill Five ways to fix statistics, Nature, 551, Nov 2017. + \vspace{3mm} + \small + \hfill Five ways to fix statistics (Nature, 551, Nov 2017; DOI:\textcolor{blue}{\href{https://doi.org/10.1038/d41586-017-07522-z}{10.1038/d41586-017-07522-z}}). \end{tcolorbox} \end{frame} @@ -189,6 +187,303 @@ + \begin{frame}{``Reproducibility crisis'' in the sciences? 
(Baker 2016, Nature 533, 452, \textcolor{blue}{\href{https://doi.org/10.1038/533452a}{DOI:10.1038/533452a}})} + \Large + 1576 researchers participated in a survey by Nature, \alert{$90\%$} believed in a crisis! + + \vspace{7mm} + \begin{center} + \begin{tabular}{ |l|r| } + \hline + Status & $\%$ agreed \\ + \hline + \alert{Yes}, a significant crisis & \textcolor{red}{$52$} \\ + \alert{Yes}, a slight crisis & \textcolor{red}{$38$} \\ + Don't know & $7$ \\ + No, there is no crisis & $3$ \\ + \hline + \end{tabular} + \end{center} + + \vspace{7mm} + Full PDF available at \textcolor{blue}{\url{https://www.nature.com/articles/533452a.pdf}} + \end{frame} + + + + + + \begin{frame}{Notebooks are not long-term solutions {\small (see appendices of Akhlaghi+2021: \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})}} + \begin{columns} + \column{0.4\linewidth} + \includegraphics[width=\linewidth]{img/dependencies-conda-initial.png} + \column{0.4\linewidth} + \includegraphics[width=0.9\linewidth]{img/dependencies-jupyter.png} + \column{0.2\linewidth} + Results from run on May 10th, 2022: + + \pause + \vspace{7mm} + Conda setup:\\\alert{39 dependencies} + + \pause + \vspace{7mm} + Jupyter (with Pip):\\\alert{61 dependencies} + + \pause + \vspace{7mm} + Web browser has more dependencies; with fluid/\alert{evolving} web technologies. + + \pause + \vspace{7mm} + They can contain \alert{binary} components. + \end{columns} + \end{frame} + + + + + + \begin{frame}{The dependency tree (Matplotlib is \emph{only one} dependency of Jupyter)} + \Wider[5em]{ + %\vspace{5mm} + \begin{center} + \includegraphics[width=0.9\linewidth]{img/matplotlib.png} + \end{center} + + \vspace{3mm}\tiny From ``Attributing and Referencing (Research) + Software: Best Practices and Outlook from Inria'' (Alliez et + al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}). + } + \end{frame} + + + + + \begin{frame}{Are containers the solution? 
Yes, but ... for the short term} + \pause + \begin{itemize} + \setlength\itemsep{5mm} + \item Containers are \alert{large} (many giga-bytes) + \begin{itemize} + \setlength\itemsep{3mm} + \pause + \item \alert{Expensive} to archive! + \pause + \item Example: \textcolor{blue}{\href{https://is.ieis.tue.nl/staff/pvgorp/share}{SHARE}} (enabling remote connection to Virtual machines with project environment): + \begin{itemize} + \setlength\itemsep{2mm} + \item \alert{2nd place} in Elsevier's ``Executable paper grand challenge'' of 2011. + \item SHARE's image repository was taken offline in 2019! + \item Even the challenge webpage is no longer available: \textcolor{blue}{\href{http://www.executablepapers.com}{http://www.executablepapers.com}} + \end{itemize} + \end{itemize} + \pause + \item Containers are \alert{binary} (tailored to certain kernels+CPUs) + \begin{itemize} + \setlength\itemsep{3mm} + \pause + \item Only guarantee the Long Term Release kernels. + \begin{itemize} + \setlength\itemsep{2mm} + \item Become un-readable, multi-gigabyte binary blobs in $\sim10$ years! + \item Even if you store them on Zenodo! + \end{itemize} + \pause + \item Only on common CPU architectures. + \end{itemize} + \pause + \item Containers \alert{themselves} are \alert{hard to reproduce}. + \begin{itemize} + \item Example: \textcolor{blue}{\href{https://ui.adsabs.harvard.edu/abs/2020CSE....22a.102M}{2020CSE....22a.102M}} use `\texttt{FROM ubuntu:16.04}', but if run today, \textcolor{blue}{\href{https://partner-images.canonical.com/core/xenial}{images are from 2021}}. + \end{itemize} + \end{itemize} + \end{frame} + + + + + + \begin{frame} + \Large For \alert{longevity issues} with Jupyter, Conda, Containers, etc. ... + + \vspace{3mm} + As well as a survey of \alert{deprecated}/\alert{abandoned}/\alert{lost} solutions since the \alert{1990s} ... + + \vspace{5mm} + \hfill ...
see the appendices in \textcolor{blue}{\href{https://arxiv.org/pdf/2006.03018.pdf}{arXiv:2006.03018}} + \end{frame} + + + + + + \begin{frame}{Our solution: CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}} + \begin{columns} + \column{0.4\linewidth} + \includegraphics[width=\linewidth]{img/maneage-paper.png} + \column{0.6\linewidth} + \includegraphics[width=\linewidth]{img/maneage-webpage.png} + \begin{center} + \huge{https://maneage.org} + \end{center} + \end{columns} + \end{frame} + + + + + + \begin{frame}{Recognition 1: RDA adoption grant (2019) to IAC for Maneage} + \begin{center} + \includegraphics[width=3cm]{img/rda.png}\hspace{1cm} + \includegraphics[width=1.8cm]{img/iac.png} + \includegraphics[width=\linewidth]{img/h2020.jpg} + \end{center} + + \vspace{1cm} For Maneage, the \alert{IAC} is selected as + a \alert{Top European organization} funded to adopt RDA + Recommendations and Outputs. + + \vspace{1cm} + \scriptsize + \begin{itemize} + \item Research Data Alliance was launched by the \alert{European + Commission}, NSF, National Institute of Standards and + Technology, and the Australian Government’s Department of + Innovation. + \item RDA Outputs are the technical and social infrastructure + solutions developed by RDA Working Groups or Interest + Groups that enable data sharing, exchange, and + interoperability. 
+ \end{itemize} + + \vspace{0.2cm} + \end{frame} + + + + + + \begin{frame}{Recognition 2: ``News and Views'' in Nature Astronomy (\textcolor{blue}{\href{https://doi.org/10.1038/s41550-021-01402-3}{DOI:10.1038/s41550-021-01402-3}})} + \begin{center} + \includegraphics[width=0.8\linewidth]{img/nature-astronomy.png} + \end{center} + + \vspace{-2mm} + \footnotesize Free-to-read link: \textcolor{blue}{\url{https://rdcu.be/cmYVx}} + \end{frame} + + + + + + \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}} + \vspace{-5mm} + \begin{columns}[t] + \column{0.5\linewidth} + \begin{center} + \large\textbf{Replicability (hardware/statistical)} + \rule{0.5\linewidth}{1pt} + \end{center} + \begin{itemize} + \setlength\itemsep{0.5em} + \item Involves data \alert{collection}. + \item Inherently includes \alert{measurements errors}\\(can + never be exactly reproduced). + \item Example: Raw telescope image/spectra. + \item \alert{\textbf{NOT DISCUSSED HERE.}} + \end{itemize} + + \vspace{3.5mm} + \begin{center} + \vspace{-5mm} + \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\ + \vspace{-0.6mm} + \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} + \end{center} + + \column{0.5\linewidth} + \end{columns} + \end{frame} + \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}} + \vspace{-5mm} + \begin{columns}[t] + \column{0.5\linewidth} + \begin{center} + \large\textbf{Replicability (hardware/statistical)} + \rule{0.5\linewidth}{1pt} + \end{center} + \begin{itemize} + \setlength\itemsep{0.5em} + \item Involves data \alert{collection}. + \item Inherently includes \alert{measurements errors}\\(can + never be exactly reproduced). 
+ \item Example: Raw telescope image/spectra. + \item \alert{\textbf{NOT DISCUSSED HERE.}} + \end{itemize} + + \vspace{3.5mm} + \begin{center} + \vspace{-5mm} + \includegraphics[width=0.7\linewidth]{img/hale-prime-focus-marked.jpg}\\ + \vspace{-0.6mm} + \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} + \end{center} + + \column{0.5\linewidth} + \end{columns} + \end{frame} + \begin{frame}[t]{Definitions \& Clarification \hspace{1.6cm} {\normalsize(from the National Academies report in 2019, \href{http://doi.org/10.17226/25303}{DOI:10.17226/25303})}} + \vspace{-5mm} + \begin{columns}[t] + \column{0.5\linewidth} + \begin{center} + \large\textbf{Replicability (hardware/statistical)} + \rule{0.5\linewidth}{1pt} + \end{center} + \begin{itemize} + \setlength\itemsep{0.5em} + \item Involves data \alert{collection}. + \item Inherently includes \alert{measurements errors}\\(can + never be exactly reproduced). + \item Example: Raw telescope image/spectra. + \item \alert{\textbf{NOT DISCUSSED HERE.}} + \end{itemize} + + \vspace{3.5mm} + \begin{center} + \vspace{-5mm} + \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\ + \vspace{-0.6mm} + \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} + \end{center} + + \column{0.5\linewidth} + \begin{center} + \large\textbf{Reproducibility (Software/Deterministic)} + \rule{0.5\linewidth}{1pt} + \end{center} + \begin{itemize} + \setlength\itemsep{1em} + \item Involves data \alert{analysis}, or simulations. + \item Starts \alert{after} data is collected/digitized. + \item Example: $2+2=4$ (i.e., sum of datasets). 
+ \item \textbf{\textcolor{green!50!black}{DISCUSSED HERE.}} + \end{itemize} + + \centering + \vspace{3mm} + \includegraphics[width=0.88\linewidth]{img/binary-blue.jpg}\\ + \vspace{-0.6mm} + \tiny \href{https://commons.wikimedia.org/wiki/File:Binary_blue.jpg}{Wikimedia Commons} + \end{columns} + \end{frame} + + + + + \begin{frame}{Founding criteria} \begin{tcolorbox}[title=Basic/simple principle:] \centering Science is defined by its METHOD, \alert{not} its @@ -237,7 +532,7 @@ - \ifdefined\longformat + \begin{frame}{Predefined/exact software tools} \small \begin{columns} @@ -251,28 +546,43 @@ reproducibility. \end{tcolorbox} - \vspace{2cm} - \begin{itemize} - \setlength\itemsep{0.6cm} + \setlength\itemsep{2mm} \item \emph{Containers} or \emph{Virtual Machines} are a \alert{binary black box}. + \begin{itemize} + \setlength\itemsep{2mm} + \item e.g., with `\texttt{FROM ubuntu:16.04}' (released in April 2016), + \item in a \texttt{Dockerfile}, the OS image will come from (updated monthly!): \url{https://partner-images.canonical.com/core/xenial} + \end{itemize} \item Maneage \alert{installs fixed versions} of all - necessary research software and their dependencies. + necessary research software. + \begin{itemize} + \item Including their dependencies. + \item All the way down to the C compiler. + \end{itemize} \item Installs similar environment on \alert{GNU/Linux}, or \alert{macOS} systems. - \item Works very much like a package manager (e.g., - \alert{\texttt{apt}} or \alert{\texttt{brew}}). + \item Works like a package manager (e.g., + \alert{\texttt{apt}}, \alert{\texttt{brew}} or Conda). + \begin{itemize} + \item ... \alert{but (!)}, it's not a third-party package manager. + \item Build instructions are within the same analysis project.
+ \item e.g., see Conda's build of Gnuastro (it gets updated behind your back): \url{https://anaconda.org/conda-forge/gnuastro/files} + \end{itemize} + + \item Source code of all software in Maneage is archived on + \textcolor{blue}{\href{https://doi.org/10.5281/zenodo.3883409}{zenodo.3883409}}. \end{itemize} \column{5cm} \includegraphics[width=\linewidth]{img/version.png} \end{columns} \end{frame} - \fi + @@ -290,21 +600,36 @@ reproducibility. \end{tcolorbox} - \vspace{2cm} - \begin{itemize} - \setlength\itemsep{0.6cm} + \setlength\itemsep{2mm} \item \emph{Containers} or \emph{Virtual Machines} are a \alert{binary black box}. + \begin{itemize} + \setlength\itemsep{2mm} + \item e.g., with `\texttt{FROM ubuntu:16.04}' (released in April 2016), + \item in a \texttt{Dockerfile}, the OS image will come from (updated monthly!): \url{https://partner-images.canonical.com/core/xenial} + \end{itemize} \item Maneage \alert{installs fixed versions} of all - necessary research software and their dependencies. + necessary research software. + \begin{itemize} + \item Including their dependencies. + \item All the way down to the C compiler. + \end{itemize} \item Installs similar environment on \alert{GNU/Linux}, or \alert{macOS} systems. - \item Works very much like a package manager (e.g., - \alert{\texttt{apt}} or \alert{\texttt{brew}}). + \item Works like a package manager (e.g., + \alert{\texttt{apt}}, \alert{\texttt{brew}} or Conda). + \begin{itemize} + \item ... \alert{but (!)}, it's not a third-party package manager. + \item Build instructions are within the same analysis project. + \item e.g., see Conda's build of Gnuastro (it gets updated behind your back): \url{https://anaconda.org/conda-forge/gnuastro/files} + \end{itemize} + + \item Source code of all software in Maneage is archived on + \textcolor{blue}{\href{https://doi.org/10.5281/zenodo.3883409}{zenodo.3883409}}.
\end{itemize} \column{5cm} @@ -316,25 +641,6 @@ - \begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies} - \Wider[5em]{ - %\vspace{5mm} - \begin{center} - \includegraphics[width=0.9\linewidth]{img/matplotlib.png} - \end{center} - - \vspace{3mm}\tiny From ``Attributing and Referencing (Research) - Software: Best Practices and Outlook from Inria'' (Alliez et - al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}). - } - \end{frame} - - - - - - - \begin{frame}{Advantages of this build system} \begin{columns} \column{11cm} @@ -714,7 +1020,15 @@ {It is very easy to expand the project and add new analysis steps (this solution is scalable)} + \begin{frame}{Files organized in directories by context (here are some of the files discussed before)} + \centering + \includegraphics[width=0.85\linewidth]{img/figure-file-architecture-1.pdf} + \end{frame} + \begin{frame}{Files organized in directories by context (now with other project files and symbolic links)} + \centering + \includegraphics[width=0.85\linewidth]{img/figure-file-architecture-2.pdf} + \end{frame} @@ -752,6 +1066,10 @@ \ifdefined\longformat \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\tomorrow}{1} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\abstractify}{1} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\projinit}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\projwork}{} @@ -766,6 +1084,7 @@ \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\gitverified}{} \else + \newcommand{\abstractify}{1} \newcommand{\projinit}{} \newcommand{\projwork}{} \newcommand{\tempevolve}{} @@ -818,15 +1137,22 @@ \alert{negligible} compared to a single figure in a paper (usually 
$\sim100$ kilo-bytes). - \vspace{1cm} The project's pipeline (customized Maneage) can be + \vspace{6mm} The project's pipeline (customized Maneage) can be \alert{published} in \begin{itemize} \item \alert{arXiv}: uploaded with the \LaTeX{} source to always stay with the paper \\(for example - \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}} or \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}). + \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}, + \textcolor{blue}{\href{https://arxiv.org/abs/1911.01430}{arXiv:1911.01430}}, + \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}, + \textcolor{blue}{\href{https://arxiv.org/abs/2007.11779}{arXiv:2007.11779}}\\ + \textcolor{blue}{\href{https://arxiv.org/abs/2010.03742}{arXiv:2010.03742}}, + \textcolor{blue}{\href{https://arxiv.org/abs/2112.14174}{arXiv:2112.14174}}). \item \alert{Zenodo}: Along with all the input datasets (many Gigabytes) and software \\(for example - \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3872248}{zenodo.3872248}}) and given a unique DOI. + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.6533902}{zenodo.6533902}}, also see comments in arXiv links above) and given a unique DOI. 
+ \item \alert{Software Heritage}: to archive the full version-controlled history of the project.\\(for example + {\small \textcolor{blue}{\href{https://archive.softwareheritage.org/swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f;origin=http://git.maneage.org/paper-concept.git/;visit=swh:1:snp:89af43c4b076a17d9298299f224247038af355ea;anchor=swh:1:rev:313db0b04bd3499f83d9e79fd7e92578cd367c2b}{swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f}}}) \end{itemize} \end{frame} @@ -834,7 +1160,20 @@ - \begin{frame}[t]{General outline of using Maneage (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})} + \begin{frame}{Software Heritage IDs (SWHID); persistent identifier for source code (or any text!)} + + \vspace{5mm} + \includegraphics[width=\linewidth]{img/SWHIDs.png} + \vspace{5mm} + + {\hfill\small For more details, see SoftwareHeritage FAQ (at \textcolor{blue}{\url{https://www.softwareheritage.org/faq}}}) + \end{frame} + + + + + + \begin{frame}[t]{Executing a Maneaged project (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})} \vspace{1cm} \texttt{\$ git clone https://gitlab.com/makhlaghi/maneage-paper{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\ @@ -850,21 +1189,6 @@ - \ifdefined\longformat - \begin{frame}{Project source and its execution} - \begin{tcolorbox} - Programs \textcolor{gray}{[here: Scientific projects]} must be - written for \alert{people to read}... - - \hfill ...and only \emph{incidentally} for machines to - \emph{execute}. - - \vspace{2mm} - \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs - \end{tcolorbox} - \end{frame} - \fi - \begin{frame}{Future prospects...} @@ -905,115 +1229,34 @@ - \begin{frame}{Summary:} - Maneage and its principles are described in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}. - It is a customizable template that will do the following steps/instructions (all in simple plain text files).
+ + + + + \begin{frame}{Summary:} + Maneage (\textcolor{blue}{\url{https://maneage.org}}) is a customizable template that will do the following for research or data reduction: \begin{itemize} - \item \alert{Automatically downloads} the necessary - \emph{software} and \emph{data}. - \item \alert{Builds} the software in a \alert{closed - environment}. - \item Runs the software on data to \alert{generate} the final - \alert{research results}. - \item Modification of part of the analysis will only - result in re-doing that part, not the whole project. - \item Using LaTeX macros, paper's figures, tables and numbers - will be \alert{Automatically updated} after a change in - analysis. Allowing the scientist to focus on the scientific - interpretation. - \item The whole project is under \alert{version control} (Git) - to allow easy reversion to a previous state. This - \alert{encourages tests/experimentation} in the analysis. - \item The \alert{Git commit hash} of the project source, is - \alert{printed} in the published paper and \alert{saved on - output} data products. Ensuring the - integrity/reproducibility of the result. - \item \colorbox{green!30!white}{These slides are available at - \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.} - \item \colorbox{green!15!white}{Longer slides are available at - \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} + \item \alert{Automatically downloads} the necessary \emph{software} and \emph{data}. + \item \alert{Builds} the software in a \alert{closed environment}. + \item Runs the software on data to \alert{generate} the final \alert{research results}. + \item Modification of part of the analysis will only result in re-doing that part, not the whole project. + \item Using LaTeX macros, paper's figures, tables and numbers will be \alert{Automatically updated}. + \item The whole project is under \alert{version control} (Git) to allow easy reversion to a previous state.
This \alert{encourages tests/experimentation} in the analysis. + \item The \alert{Git commit hash} of the project source, is \alert{printed} in the published paper and \alert{saved on output} data products. Ensuring the integrity/reproducibility of the result. + \item \colorbox{green!30!white}{These slides are available at \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.} + \item \colorbox{green!15!white}{Longer slides are available at \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} + \begin{itemize} + \item YouTube recording (May 2021): \textcolor{blue}{\url{https://www.youtube.com/watch?v=XdhRUhoMqw0}} + \end{itemize} + \item \colorbox{purple!20!white}{\small Matrix-protocol chat room: \texttt{\#maneage-general:matrix.org}} \end{itemize} \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, top=1pt, bottom=1pt] - For a technical description of Maneage's implementation, as well - as a checklist to customize it, and tips on good practices, - please see this page: + For a technical description of Maneage's implementation, as well as a checklist to customize it, and tips on good practices, please see this page: \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} \end{tcolorbox} \end{frame} - - -\ifdefined\longformat -\begin{frame}{Existing technologies (Independent environment)} - \begin{itemize} - \setlength\itemsep{7mm} - \item \textbf{Virtual machines:} - \begin{itemize} - \setlength\itemsep{3mm} - \item Contain the \alert{full operating system}, are thus very large ($\times$Gigabytes). - \item In \emph{binary} format (decoding a built VM's environment is extremely hard and inaccurate). - \end{itemize} - \item \textbf{Containers:} (For example Docker or Singularity) - \begin{itemize} - \setlength\itemsep{3mm} - \item Similar to virtual machines, but \alert{without low-level kernel} (use host's kernel). 
- \item \alert{Will fail} as soon as kernel is no longer supported\\(for example Docker currently only supports Linux kernel 3.10 and above \alert{from 2013}). - \item Good solutions for software engineers (that need to \emph{reproduce a bug's environment today}). - \item Docker is modular, needs root previlages (not available in HPCs), Dockerfiles allow incompleteness\\(especially in the common scenario of using the operating system's package manager, see next slide) - \item Singularity is monolithic and thus can be very large. - \item In \alert{binary} format (similar to VMs, especially when OS package managers are used). - \end{itemize} - \end{itemize} - - \vspace{3mm} -In summary, they only \alert{store a built} environment (they are outputs, not good for archiving). -\end{frame} - - - - -\begin{frame}{Existing technologies (Package managers)} - - \begin{itemize} - \item \textbf{Operating system package managers:} - \begin{itemize} - \setlength\itemsep{2mm} - \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software). - \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible). - \item Older software (for example +5 years) is usually removed. - \end{itemize} - \item \textbf{Conda/Anaconda:} - \begin{itemize} - \setlength\itemsep{2mm} - \item Conda has build instructions for software and their dependencies. - \item But it doesn't go down to the C library or the lower-level components of operating system. - \item It is written in Python (can't be used later when current Python is depreciated). - \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility). 
- \end{itemize} - \item \textbf{Nix, or GNU Guix:} - \begin{itemize} - \setlength\itemsep{2mm} - \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access. - \item Doesn't \emph{require} documentation of dependencies. - \end{itemize} - \item \textbf{Spack:} Similar to Nix/Guix but written in Python. - \end{itemize} -\end{frame} - -\begin{frame}{Existing technologies (workflow tools)} - \begin{itemize} - \setlength\itemsep{4mm} - \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda. - \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI make it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding. - \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them. - \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker. - \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker. - \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software. - \end{itemize} - \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. 
-\end{frame} -\fi \end{document} diff --git a/slides-intro.tex b/slides-intro.tex index 3ffedf9..c395cdc 100644 --- a/slides-intro.tex +++ b/slides-intro.tex @@ -35,31 +35,32 @@ %% Set the title \title{\huge\textbf{BIG} Data, \textbf{BIG} responsibility - \\ {\normalsize Introducing \emph{Maneage}: customizable framework for \emph{man}aging data lin\emph{eage}}} + \\\vspace{2mm} \large Maneage: \emph{Man}aging data lin\emph{eage} for long-term and archivable reproducibility \\\vspace{1mm} \footnotesize (Published in CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}})} %% Set the author -\author{\vspace{1cm}\\ +\author{\\ \href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm} \footnotesize - Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain + Centro de Estudios de F\'isica del Cosmos de Arag\'on ({\scriptsize CEFCA}), Teruel, Spain\vspace{8mm} } %% Set the date and insitutional logos. 
-\date{\footnotesize\vspace{0cm}\\ - \textcolor{white}{PLACE HOLDER}\\\textcolor{white}{MONTH DAY, YEAR} \\ - \tiny\vspace{3mm} +\date{\footnotesize\vspace{-5mm}\\ + \textcolor{black}{Royal Observatory Coffee talk; Edinburgh}\\ + \textcolor{black}{23rd of May 2023} \\ + \tiny\vspace{9mm} Most recent slides available in link below (this PDF is built from \href{http://git.maneage.org/slides-intro.git}{Git commit} \gitcommit):\\ \footnotesize\textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}\\ - \vspace{2mm}\hspace{-0.25cm} - \raisebox{+0.4\height}{\includegraphics[width=2.5cm]{img/ministerio-ciencia.png}} - \raisebox{+0.3\height}{\includegraphics[width=1.3cm]{img/sundial.png}} + \vspace{2mm} + \raisebox{+0.4\height}{\includegraphics[width=3cm]{img/jcava.jpg}} + \includegraphics[width=1cm]{img/cefca.png} \includegraphics[width=1.2cm]{img/iac.png} \includegraphics[width=1cm]{img/eu-sundial.png} \raisebox{0.13\height}{\includegraphics[width=1cm]{img/eu-regional.png}} \raisebox{0.05\height}{\includegraphics[width=1cm]{img/eu-rdaeu4.png}} \raisebox{+0.1\height}{\includegraphics[width=1.4cm]{img/rda-europe.png}} - \raisebox{+1.3\height}{\includegraphics[width=1.4cm]{img/ull.png}} - { }\raisebox{+0.5\height}{\includegraphics[width=2cm]{img/gobierno-canarias.png}}\\ + \raisebox{+1\height}{\includegraphics[width=1.5cm]{img/aragon.png}} + \raisebox{+0.8\height}{\includegraphics[width=1.6cm]{img/gobierno-canarias.png}}\\ \vspace{1cm} } @@ -88,10 +89,6 @@ \titlepage \end{frame} - - - - \begin{frame}{Let's start with this nice image of the Wirlpool galaxy (M51): \small{\url{https://i.redd.it/jfqgpqg0hfk11.jpg}}} \begin{center} \includegraphics[width=0.8\linewidth]{img/m51-amateur.jpg} @@ -231,9 +228,88 @@ for computational reproducibility] \end{frame} - \begin{frame}{``Reproducibility crisis'' in the sciences? 
(Baker 2016, Nature 533, 452)} - \centering - \includegraphics[width=0.85\linewidth]{img/reproducibility-crisis.jpg} + + + + \begin{frame}{``Reproducibility crisis'' in the sciences? (Baker 2016, Nature 533, 452, \textcolor{blue}{\href{https://doi.org/10.1038/533452a}{DOI:10.1038/533452a}})} + \Large + 1576 researchers participated in a survey by Nature, \alert{$90\%$} believed in a crisis! + + \vspace{7mm} + \begin{center} + \begin{tabular}{ |l|r| } + \hline + Status & $\%$ agreed \\ + \hline + \alert{Yes}, a significant crisis & \textcolor{red}{$52$} \\ + \alert{Yes}, a slight crisis & \textcolor{red}{$38$} \\ + Don't know & $7$ \\ + No, there is no crisis & $3$ \\ + \hline + \end{tabular} + \end{center} + + \vspace{7mm} + Full PDF available at \textcolor{blue}{\url{https://www.nature.com/articles/533452a.pdf}} + \end{frame} + + + + + + \begin{frame}{Our solution: CiSE 23 (3), pp 82-91: \textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2021.3072860}{DOI:10.1109/MCSE.2021.3072860}}, \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}} + \begin{columns} + \column{0.4\linewidth} + \includegraphics[width=\linewidth]{img/maneage-paper.png} + \column{0.6\linewidth} + \includegraphics[width=\linewidth]{img/maneage-webpage.png} + \begin{center} + \huge{https://maneage.org} + \end{center} + \end{columns} + \end{frame} + + + + \begin{frame}{Recognition 1: RDA adoption grant (2019) to IAC for Maneage} + \begin{center} + \includegraphics[width=3cm]{img/rda.png}\hspace{1cm} + \includegraphics[width=1.8cm]{img/iac.png} + \includegraphics[width=\linewidth]{img/h2020.jpg} + \end{center} + + \vspace{1cm} For Maneage, the \alert{IAC} is selected as + a \alert{Top European organization} funded to adopt RDA + Recommendations and Outputs. 
+ + \vspace{1cm} + \scriptsize + \begin{itemize} + \item Research Data Alliance was launched by the \alert{European + Commission}, NSF, National Institute of Standards and + Technology, and the Australian Government’s Department of + Innovation. + \item RDA Outputs are the technical and social infrastructure + solutions developed by RDA Working Groups or Interest + Groups that enable data sharing, exchange, and + interoperability. + \end{itemize} + + \vspace{0.2cm} + \end{frame} + + + + + + \begin{frame}{Recognition 2: ``News and Views'' in Nature Astronomy (\textcolor{blue}{\href{https://ui.adsabs.harvard.edu/abs/2021NatAs...5..986K}{2021NatAs...5..986K}})} + \begin{center} + \includegraphics[width=\linewidth]{img/nature-astronomy.png} + \end{center} + + \vspace{1cm} + Free-to-read link: \textcolor{blue}{\url{https://rdcu.be/cmYVx}}\\ + DOI: \textcolor{blue}{\href{https://doi.org/10.1038/s41550-021-01402-3}{10.1038/s41550-021-01402-3}} \end{frame} @@ -290,7 +366,7 @@ for computational reproducibility] \vspace{-5mm} \includegraphics[width=0.7\linewidth]{img/hale-prime-focus-marked.jpg}\\ \vspace{-0.6mm} - \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} + \tiny \href{https://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/l07}{http://slittlefair.staff.shef.ac.uk} \end{center} \column{0.5\linewidth} @@ -318,7 +394,7 @@ for computational reproducibility] \vspace{-5mm} \includegraphics[width=0.7\linewidth]{img/hale-prime-focus.jpg}\\ \vspace{-0.6mm} - \tiny \href{http://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/L07/index.html}{http://slittlefair.staff.shef.ac.uk} + \tiny \href{https://slittlefair.staff.shef.ac.uk/teaching/phy217/lectures/telescopes/l07}{http://slittlefair.staff.shef.ac.uk} \end{center} \column{0.5\linewidth} @@ -336,9 +412,9 @@ for computational reproducibility] \centering \vspace{3mm} - 
\includegraphics[width=0.8\linewidth]{img/digital-tunnel.jpg}\\ + \includegraphics[width=0.88\linewidth]{img/binary-blue.jpg}\\ \vspace{-0.6mm} - \tiny \href{https://tsongas.com/newsletter_articles/the-new-electronic-version-of-the-advantage/digital-tunnel-wallpaper/}{https://tsongas.com} + \tiny \href{https://commons.wikimedia.org/wiki/File:Binary_blue.jpg}{Wikimedia Commons} \end{columns} \end{frame} @@ -354,14 +430,15 @@ for computational reproducibility] \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \newcommand{\sver}{} \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} - \begin{frame}{Different package managers have different versions of software (repology.org, 2019/11/20)} + \begin{frame}[t]{Different package managers have different versions of software (repology.org, 2021/12/02)} \begin{columns} \column{7cm} \center Astropy\\ - \includegraphics[width=2.2cm]{img/distros-astropy.pdf} + \includegraphics[width=2.8cm]{img/distros-astropy.pdf} \column{7cm} \center GNU Astronomy Utilities (Gnuastro)\\ - \includegraphics[width=2.7cm]{img/distros-gnuastro.pdf} + \includegraphics[trim={0 8cm 0 0}, clip, width=2.9cm]{img/distros-gnuastro.pdf} + \includegraphics[trim={0 0 0 16cm}, clip, width=2.9cm]{img/distros-gnuastro.pdf} \end{columns} \end{frame} \newcommand{\srep}{} @@ -386,13 +463,13 @@ for computational reproducibility] al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}). 
} \end{frame} - \begin{frame}{Impact of ``Dependency hell'' on native building in various hardware (CPU architectures)} + \begin{frame}{Impact of ``Dependency hell'' on native building in various hardware (CPU architectures), retrieved from Debian on 2021/12/02} \begin{columns} \column{7cm} - \includegraphics[width=0.9\linewidth]{img/cpu-arch-astropy.png} + \includegraphics[width=\linewidth]{img/cpu-arch-astropy.png} Astropy depends on Matplotlib \column{6cm} - \includegraphics[width=0.9\linewidth]{img/cpu-arch-gnuastro.png} + \includegraphics[width=1.05\linewidth]{img/cpu-arch-gnuastro.png} GNU Astronomy Utilities doesn't. \end{columns} \end{frame} @@ -444,23 +521,16 @@ for computational reproducibility] \begin{frame}{Science is a tricky business} - \begin{center} - \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg} - \end{center} - - \vspace{-0.3cm}\hfill - {\tiny Image from nature.com - (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five - ways to fix statistics}'', Nov 2017)}\hspace{7mm} - \vspace{-1mm} \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm] - \small Data analysis [...] is a \alert{human + \large Data analysis [...] is a \alert{human behavior}. Researchers who hunt hard enough will turn up a result that fits statistical criteria, but their \alert{discovery} will probably be a \alert{false positive}. - \hfill Five ways to fix statistics, Nature, 551, Nov 2017. + \vspace{3mm} + \small + \hfill Five ways to fix statistics (Nature, 551, Nov 2017; DOI:\textcolor{blue}{\href{https://doi.org/10.1038/d41586-017-07522-z}{10.1038/d41586-017-07522-z}}). 
\end{tcolorbox} \end{frame} @@ -1185,7 +1255,10 @@ for computational reproducibility] - + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\tomorrow}{1} + \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \newcommand{\abstractify}{1} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\projinit}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} @@ -1226,6 +1299,12 @@ for computational reproducibility] + \begin{frame}{Any Git-based workflow is possible.} + \centering + \includegraphics[width=1.2\linewidth]{img/figure-branching.pdf} + \end{frame} + + \begin{frame}{Publication of the project} @@ -1242,17 +1321,25 @@ for computational reproducibility] \alert{negligible} compared to a single figure in a paper (usually $\sim100$ kilo-bytes). - \vspace{1cm} The project's pipeline (customized Maneage) can be + \pause + + \vspace{7mm} The project's pipeline (customized Maneage) can be \alert{published} in \begin{itemize} \item \alert{arXiv}: uploaded with the \LaTeX{} source to always stay with the paper \\(for example - \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The - file containing all macros must also be uploaded so arXiv's - server can easily build the \LaTeX{} source. + \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}} or \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}). \item \alert{Zenodo}: Along with all the input datasets (many Gigabytes) and software \\(for example - \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}}) and given a unique DOI. + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3872247}{zenodo.3872247}}) and given a unique DOI. + \begin{itemize} + \item ... and put links to data in paper! 
See ending of caption of Figure 1 in the \textcolor{blue}{\href{https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9403875}{Maneage paper}}. + \end{itemize} + \item \alert{Software Heritage}: to archive the full version-controlled history of the project.\\(for example + {\small \textcolor{blue}{\href{https://archive.softwareheritage.org/swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f;origin=http://git.maneage.org/paper-concept.git/;visit=swh:1:snp:89af43c4b076a17d9298299f224247038af355ea;anchor=swh:1:rev:313db0b04bd3499f83d9e79fd7e92578cd367c2b}{swh:1:dir:33fea87068c1612daf011f161b97787b9a0df39f}}}) + \begin{itemize} + \item ... and put links to exact parts of the code! See caption of Listing 1 in the \textcolor{blue}{\href{https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9403875}{Maneage paper}}. + \end{itemize} \end{itemize} \end{frame} @@ -1260,6 +1347,19 @@ for computational reproducibility] + \begin{frame}{Software Heritage IDs (SWHID); persistent identifier for source code (or any text!)} + + \vspace{5mm} + \includegraphics[width=\linewidth]{img/SWHIDs.png} + \vspace{5mm} + + {\hfill\small For more details, see SoftwareHeritage FAQ (at \textcolor{blue}{\url{https://www.softwareheritage.org/faq}}}) + \end{frame} + + + + + \begin{frame}{Project source and its execution} \begin{tcolorbox} Programs \textcolor{gray}{[here: Scientific projects]} must be @@ -1333,58 +1433,45 @@ for computational reproducibility] - \begin{frame}{RDA adoption grant (2019) to IAC for Maneage} - \begin{center} - \includegraphics[width=3cm]{img/rda.png}\hspace{1cm} - \includegraphics[width=1.8cm]{img/iac.png} - \includegraphics[width=\linewidth]{img/h2020.jpg} - \end{center} - - \vspace{1cm} For Maneage, the \alert{IAC} is selected as - a \alert{Top European organization} funded to adopt RDA - Recommendations and Outputs. 
- - \vspace{1cm} - \scriptsize - \begin{itemize} - \item Research Data Alliance was launched by the \alert{European - Commission}, NSF, National Institute of Standards and - Technology, and the Australian Government’s Department of - Innovation. - \item RDA Outputs are the technical and social infrastructure - solutions developed by RDA Working Groups or Interest - Groups that enable data sharing, exchange, and - interoperability. - \end{itemize} - - \vspace{0.2cm} - \end{frame} - - + \begin{frame}{Summary:} + Maneage is introduced as a customizable template that will do the + following steps/instructions (all in simple plain text files). + \begin{itemize} + \item \alert{Automatically downloads} the necessary + \emph{software} and \emph{data}. + \item \alert{Builds} the software in a \alert{closed + environment}. + \item Runs the software on data to \alert{generate} the final + \alert{research results}. + \item Only parts affected by a modification are re-done. + \item Using LaTeX macros, paper's figures, tables and numbers + will be \alert{Automatically updated}. + \item The whole project is under \alert{version control} (Git) + \alert{encouraging tests/experimentation}. + \item The \alert{Git commit hash} of the project source, is + \alert{printed} in the paper and \alert{on output} data + products. + \item \colorbox{green!30!white}{These slides are available at + \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} + \end{itemize} - \begin{frame}{Workshop on Maneage at IAC: \alert{first week of April} (March 30th to April 3rd)} + \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, + top=1pt, bottom=1pt] + For a technical description of Maneage's implementation, as well + as a checklist to customize it, and tips on good practices, + please see this page: - We are organizing a workshop to help interested \alert{early career researchers} adopt Maneage. 
+ \textcolor{blue}{\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} - \vspace{5mm} - \begin{columns} - \column{0.25\linewidth} - \centering - \includegraphics[width=\linewidth]{img/rda-europe.png} - \column{0.5\linewidth} - \centering - \includegraphics[width=\linewidth]{img/workshop-shutterstock.png}\\ - {\tiny Image from \href{https://www.shutterstock.com/es/image-vector/managers-workshop-training-manager-skills-brainstorming-1334996078}{shutterstock.com}} + \vspace{3mm} + \hfill \colorbox{green!30!white}{Feel free to contact me: \textcolor{blue}{\large{\url{mohammad@akhlaghi.org}}}} + \end{tcolorbox} + \end{frame} +\end{document} - \column{0.25\linewidth} - \includegraphics[width=0.7\linewidth]{img/iac.png} - \end{columns} - \vspace{7mm} - Please contact \alert{akhlaghi@iac.es} to join (Space is very limited: it is hands-on). - \end{frame} @@ -1419,93 +1506,49 @@ In summary, they only \alert{store a built} environment (they are outputs, not g -\begin{frame}{Existing technologies (Package managers)} - - \begin{itemize} - \item \textbf{Operating system package managers:} - \begin{itemize} - \setlength\itemsep{2mm} - \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software). - \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible). - \item Older software (for example +5 years) is usually removed. - \end{itemize} - \item \textbf{Conda/Anaconda:} - \begin{itemize} - \setlength\itemsep{2mm} - \item Conda has build instructions for software and their dependencies. - \item But it doesn't go down to the C library or the lower-level components of operating system. - \item It is written in Python (can't be used later when current Python is depreciated). 
- \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility). - \end{itemize} - \item \textbf{Nix, or GNU Guix:} - \begin{itemize} - \setlength\itemsep{2mm} - \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access. - \item Doesn't \emph{require} documentation of dependencies. - \end{itemize} - \item \textbf{Spack:} Similar to Nix/Guix but written in Python. - \end{itemize} -\end{frame} - -\begin{frame}{Existing technologies (workflow tools)} - \begin{itemize} - \setlength\itemsep{4mm} - \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda. - \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI make it hard to automate, and has too many dependencies. Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding. - \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them. - \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker. - \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker. - \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software. - \end{itemize} - \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. 
+%\begin{frame}{Existing technologies (Package managers)} +% +% \begin{itemize} +% \item \textbf{Operating system package managers:} +% \begin{itemize} +% \setlength\itemsep{2mm} +% \item For example \texttt{apt} or \texttt{yum} for Debian-based and RedHat-based GNU/Linux operating systems\\(the most common way to install software). +% \item Tightly intertwined with the operating system's components\\(arbitrary control of software versions is not easily possible). +% \item Older software (for example +5 years) is usually removed. +% \end{itemize} +% \item \textbf{Conda/Anaconda:} +% \begin{itemize} +% \setlength\itemsep{2mm} +% \item Conda has build instructions for software and their dependencies. +% \item But it doesn't go down to the C library or the lower-level components of operating system. +% \item It is written in Python (can't be used later when current Python is depreciated). +% \item Authors of Uhse+2019\footnote{\url{http://dx.doi.org/10.1002/cppb.20097}} report\footnote{\url{https://github.com/conda-forge/conda-forge.github.io/issues/787}} that their Conda environment breaks roughly every 3 months\\(Conda environments need to be updated to be used later! Breaking reproducibility). +% \end{itemize} +% \item \textbf{Nix, or GNU Guix:} +% \begin{itemize} +% \setlength\itemsep{2mm} +% \item Deliver perfectly reproducible builds (bit-wise reproducibility of software), needs root access. +% \item Doesn't \emph{require} documentation of dependencies. +% \end{itemize} +% \item \textbf{Spack:} Similar to Nix/Guix but written in Python. +% \end{itemize} +%\end{frame} +% +%\begin{frame}{Existing technologies (workflow tools)} +% \begin{itemize} +% \setlength\itemsep{4mm} +% \item \textbf{Binder:} (\url{https://mybinder.org}) Docker+Conda. +% \item \textbf{Galaxy:} (\url{https://galaxyproject.org}) A web-based user interface, primarily designed for genomics. The GUI make it hard to automate, and has too many dependencies. 
Very similar to GenePattern (2008 to 2017): with +40,000 users and $\sim4000$ jobs running per week, but cut due to funding. +% \item \textbf{Sciunit:} (\url{https://sciunit.run}) Parses program binaries to try to infer their dependencies and copy them. +% \item \textbf{Popper:} (\url{https://falsifiable.us}), HCL (previously used by GitHub Actions) + Conda + Docker. +% \item \textbf{WholeTale:} (\url{https://wholetale.org}) Jupyter + Conda + Docker. +% \item \textbf{Image Processing On Line (IPOL) journal:} The best example of publishing algorithms/methods I have seen, only useful for very basic/low-level software. +% \end{itemize} +% \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. \end{frame} - - \begin{frame}{Summary:} - - Maneage is introduced as a customizable template that will do the - following steps/instructions (all in simple plain text files). - \begin{itemize} - \item \alert{Automatically downloads} the necessary - \emph{software} and \emph{data}. - \item \alert{Builds} the software in a \alert{closed - environment}. - \item Runs the software on data to \alert{generate} the final - \alert{research results}. - \item A modification in one part of the analysis will only - result in re-doing that part, not the whole project. - \item Using LaTeX macros, paper's figures, tables and numbers - will be \alert{Automatically updated} after a change in - analysis. Allowing the scientist to focus on the scientific - interpretation. - \item The whole project is under \alert{version control} (Git) - to allow easy reversion to a previous state. This - \alert{encourages tests/experimentation} in the analysis. - \item The \alert{Git commit hash} of the project source, is - \alert{printed} in the published paper and \alert{saved on - output} data products. Ensuring the - integrity/reproducibility of the result. 
- \item \colorbox{green!30!white}{These slides are available at - \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} - \end{itemize} - - \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, - top=1pt, bottom=1pt] - For a technical description of Maneage's implementation, as well - as a checklist to customize it, and tips on good practices, - please see this page: - - \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} - \end{tcolorbox} - \end{frame} -\end{document} - - - - - % \begin{frame}{Funding to help adoption of template} % \begin{itemize} % \setlength\itemsep{5mm} diff --git a/tex/git-branch.tex b/tex/git-branch.tex index ad5b2c9..79b7e24 100644 --- a/tex/git-branch.tex +++ b/tex/git-branch.tex @@ -18,6 +18,7 @@ \begin{tikzpicture} \draw [white] (0,0) -- (0,7.4); + \draw [white] (-1cm,0) -- (5,0); %% Template branch. \ifdefined\tofuture @@ -26,7 +27,9 @@ \ifdefined\tempevolve \draw[->, line width=2mm] (0,0) -- (0,5.5); \else - \draw[->, line width=2mm] (0,0) -- (0,2.3); + \ifdefined\abstractify + \draw[->, line width=2mm] (0,0) -- (0,2.3); + \fi \fi \fi @@ -57,12 +60,22 @@ %% otherwise the black lines are going to be over the commit %% circles. - %% Template commits. - \draw[anchor=north] (0,0) node {\textbf{Maneage}}; - \draw [fill=green!80!blue, opacity=0.5] (0,0.55cm) circle [radius=2.1mm]; - \draw[anchor=east] (-5pt,0.55cm) node {\tiny\texttt{ad2c476}}; - \draw [fill=green!80!blue, opacity=0.5] (0,1.55cm) circle [radius=2.1mm]; - \draw[anchor=east] (-5pt,1.55cm) node {\tiny\texttt{706c644}}; + %% Maneage commits. 
+ \ifdefined\abstractify + \draw[anchor=north] (0,0) node {\textbf{Maneage}}; + \draw [fill=green!80!blue, opacity=0.5] (0,0.55cm) circle [radius=2.1mm]; + \draw[anchor=east] (-5pt,0.55cm) node {\tiny\texttt{ad2c476}}; + \draw [fill=green!80!blue, opacity=0.5] (0,1.55cm) circle [radius=2.1mm]; + \draw[anchor=east] (-5pt,1.55cm) node {\tiny\texttt{706c644}}; + \else + \node [inner sep=0pt] at (0,0.55cm) {\includegraphics[width=1.5cm]{img/project-flow-small.png}}; + \draw[anchor=center] (0,0.55cm) node {\bf Today}; + \ifdefined\tomorrow + \node [inner sep=0pt] at (0,1.55cm) {\includegraphics[width=1.5cm]{img/project-flow-small.png}}; + \draw[anchor=center] (0,1.55cm) node {\bf Tomorrow}; + \else + \fi + \fi \ifdefined\tempevolve \draw [fill=green!80!blue, opacity=0.5] (0,2.55cm) circle [radius=2.1mm]; \draw[anchor=east] (-5pt,2.55cm) node {\tiny\texttt{fa2ac10}}; @@ -119,7 +132,11 @@ \vspace{-1cm} \begin{itemize} \setlength\itemsep{0.2cm} - \item Template's history is recorded in Git. + \ifdefined\abstractify + \item Each point of project's history is recorded with Git. + \else + \item The project (answers to questions above) will evolve. + \fi \ifdefined\projinit \item New project: a branch from the template.\\ Recall that \alert{every commit} contains the following: |