From d1faba600ae780fa7ba1994cf38504b75119d644 Mon Sep 17 00:00:00 2001 From: Mohammad Akhlaghi Date: Thu, 18 Jun 2020 02:52:26 +0100 Subject: Completed short slides, with simple macro to cut off increments The short version of the slides is now ready. There is a '\longformat' macro that, when defined, will significantly increase the number of slides, but not their substance (the extra slides are just the incremental steps). Some minor modifications were also made in the long version. --- img/data-lineage-1.pdf | Bin 5924 -> 6847 bytes img/data-lineage-10.pdf | Bin 13879 -> 26746 bytes img/data-lineage-11.pdf | Bin 14062 -> 29143 bytes img/data-lineage-12.pdf | Bin 14392 -> 30659 bytes img/data-lineage-13.pdf | Bin 14591 -> 31739 bytes img/data-lineage-14.pdf | Bin 14698 -> 33643 bytes img/data-lineage-15.pdf | Bin 14867 -> 40754 bytes img/data-lineage-16.pdf | Bin 15016 -> 0 bytes img/data-lineage-17.pdf | Bin 15061 -> 0 bytes img/data-lineage-18.pdf | Bin 15213 -> 0 bytes img/data-lineage-19.pdf | Bin 15593 -> 0 bytes img/data-lineage-2.pdf | Bin 6558 -> 8505 bytes img/data-lineage-20.pdf | Bin 15755 -> 0 bytes img/data-lineage-21.pdf | Bin 16009 -> 0 bytes img/data-lineage-22.pdf | Bin 16191 -> 0 bytes img/data-lineage-3.pdf | Bin 7353 -> 11884 bytes img/data-lineage-4.pdf | Bin 7627 -> 13709 bytes img/data-lineage-5.pdf | Bin 7804 -> 14821 bytes img/data-lineage-6.pdf | Bin 12608 -> 20722 bytes img/data-lineage-7.pdf | Bin 12804 -> 21780 bytes img/data-lineage-8.pdf | Bin 13040 -> 23239 bytes img/data-lineage-9.pdf | Bin 13725 -> 25089 bytes img/tools-per-year-orig.jpg | Bin 0 -> 46204 bytes img/tools-per-year.pdf | Bin 0 -> 11960 bytes slides-intro-short.tex | 298 +++++++++++++++++++++++--------------------- slides-intro.tex | 63 +++++----- tex/git-branch.tex | 2 +- 27 files changed, 193 insertions(+), 170 deletions(-) delete mode 100644 img/data-lineage-16.pdf delete mode 100644 img/data-lineage-17.pdf delete mode 100644 img/data-lineage-18.pdf delete mode 100644 img/data-lineage-19.pdf
delete mode 100644 img/data-lineage-20.pdf delete mode 100644 img/data-lineage-21.pdf delete mode 100644 img/data-lineage-22.pdf create mode 100644 img/tools-per-year-orig.jpg create mode 100644 img/tools-per-year.pdf diff --git a/img/data-lineage-1.pdf b/img/data-lineage-1.pdf index ffa9df7..358f5e6 100644 Binary files a/img/data-lineage-1.pdf and b/img/data-lineage-1.pdf differ diff --git a/img/data-lineage-10.pdf b/img/data-lineage-10.pdf index 263b150..5b260da 100644 Binary files a/img/data-lineage-10.pdf and b/img/data-lineage-10.pdf differ diff --git a/img/data-lineage-11.pdf b/img/data-lineage-11.pdf index 913e31f..aae7615 100644 Binary files a/img/data-lineage-11.pdf and b/img/data-lineage-11.pdf differ diff --git a/img/data-lineage-12.pdf b/img/data-lineage-12.pdf index 9b29c45..0ecbd1d 100644 Binary files a/img/data-lineage-12.pdf and b/img/data-lineage-12.pdf differ diff --git a/img/data-lineage-13.pdf b/img/data-lineage-13.pdf index 65b8b02..07baeca 100644 Binary files a/img/data-lineage-13.pdf and b/img/data-lineage-13.pdf differ diff --git a/img/data-lineage-14.pdf b/img/data-lineage-14.pdf index 31bfa27..0c0c9e9 100644 Binary files a/img/data-lineage-14.pdf and b/img/data-lineage-14.pdf differ diff --git a/img/data-lineage-15.pdf b/img/data-lineage-15.pdf index 409cee4..f925185 100644 Binary files a/img/data-lineage-15.pdf and b/img/data-lineage-15.pdf differ diff --git a/img/data-lineage-16.pdf b/img/data-lineage-16.pdf deleted file mode 100644 index d924ec2..0000000 Binary files a/img/data-lineage-16.pdf and /dev/null differ diff --git a/img/data-lineage-17.pdf b/img/data-lineage-17.pdf deleted file mode 100644 index 5ac0675..0000000 Binary files a/img/data-lineage-17.pdf and /dev/null differ diff --git a/img/data-lineage-18.pdf b/img/data-lineage-18.pdf deleted file mode 100644 index a800a44..0000000 Binary files a/img/data-lineage-18.pdf and /dev/null differ diff --git a/img/data-lineage-19.pdf b/img/data-lineage-19.pdf deleted file mode 100644 
index 7513caa..0000000 Binary files a/img/data-lineage-19.pdf and /dev/null differ diff --git a/img/data-lineage-2.pdf b/img/data-lineage-2.pdf index e338e82..d79231f 100644 Binary files a/img/data-lineage-2.pdf and b/img/data-lineage-2.pdf differ diff --git a/img/data-lineage-20.pdf b/img/data-lineage-20.pdf deleted file mode 100644 index 13ffab5..0000000 Binary files a/img/data-lineage-20.pdf and /dev/null differ diff --git a/img/data-lineage-21.pdf b/img/data-lineage-21.pdf deleted file mode 100644 index c7d4372..0000000 Binary files a/img/data-lineage-21.pdf and /dev/null differ diff --git a/img/data-lineage-22.pdf b/img/data-lineage-22.pdf deleted file mode 100644 index f2155aa..0000000 Binary files a/img/data-lineage-22.pdf and /dev/null differ diff --git a/img/data-lineage-3.pdf b/img/data-lineage-3.pdf index a8a5671..9d0e77b 100644 Binary files a/img/data-lineage-3.pdf and b/img/data-lineage-3.pdf differ diff --git a/img/data-lineage-4.pdf b/img/data-lineage-4.pdf index 6d686a7..1eb0255 100644 Binary files a/img/data-lineage-4.pdf and b/img/data-lineage-4.pdf differ diff --git a/img/data-lineage-5.pdf b/img/data-lineage-5.pdf index 3ae9baf..3a6298f 100644 Binary files a/img/data-lineage-5.pdf and b/img/data-lineage-5.pdf differ diff --git a/img/data-lineage-6.pdf b/img/data-lineage-6.pdf index 374de91..38e1731 100644 Binary files a/img/data-lineage-6.pdf and b/img/data-lineage-6.pdf differ diff --git a/img/data-lineage-7.pdf b/img/data-lineage-7.pdf index b590898..6dc481e 100644 Binary files a/img/data-lineage-7.pdf and b/img/data-lineage-7.pdf differ diff --git a/img/data-lineage-8.pdf b/img/data-lineage-8.pdf index 7dd6425..c1b6d09 100644 Binary files a/img/data-lineage-8.pdf and b/img/data-lineage-8.pdf differ diff --git a/img/data-lineage-9.pdf b/img/data-lineage-9.pdf index 29c6e5a..2211343 100644 Binary files a/img/data-lineage-9.pdf and b/img/data-lineage-9.pdf differ diff --git a/img/tools-per-year-orig.jpg b/img/tools-per-year-orig.jpg new file 
mode 100644 index 0000000..049f7d5 Binary files /dev/null and b/img/tools-per-year-orig.jpg differ diff --git a/img/tools-per-year.pdf b/img/tools-per-year.pdf new file mode 100644 index 0000000..8890226 Binary files /dev/null and b/img/tools-per-year.pdf differ diff --git a/slides-intro-short.tex b/slides-intro-short.tex index 88c14aa..79506ea 100644 --- a/slides-intro-short.tex +++ b/slides-intro-short.tex @@ -18,6 +18,9 @@ % Basic LaTeX settings. \documentclass[9pt,usenames,dvipsnames,aspectratio=169]{beamer} +% Make it super short. +%\newcommand{\longformat}{} + % Read the current Git commit information \include{git-commit} \include{tex/preamble} @@ -34,8 +37,8 @@ (\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}} %% Set the title -\title{{\huge\textbf{BIG} Data, \textbf{BIG} responsibility} \\ - {\LARGE Towards Long-term and Archivable Reproducibility} \\ +\title{Introducing Maneage:\\ + Customizable framework for managing data lineage\\ \vspace{2mm}{\small [RDA Europe Adoption grant recipient. 
Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE} (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}), Comments welcome]} } @@ -89,6 +92,7 @@ \begin{frame} \titlepage \end{frame} + \usebackgroundtemplate{ } %% undeclare it \begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}} @@ -105,7 +109,8 @@ \includegraphics[width=4cm]{img/wds.jpg} \end{center} - \pause + \ifdefined\longformat\pause\fi + ``\emph{We would like to see a workflow that results in all \textcolor{blue!30!green}{\bf scholarly objects being connected}, linked, citable, and persistent to allow researchers to navigate @@ -118,13 +123,14 @@ discoveries.}'' \end{frame} -\usebackgroundtemplate{ } %% undeclare it - - \newcommand{\allopacity}{1} + \ifdefined\longformat \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \fi \newcommand{\paperinit}{} + \ifdefined\longformat \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} + \fi \newcommand{\sver}{} \newcommand{\srep}{} \newcommand{\dver}{} @@ -189,7 +195,7 @@ result. \end{tcolorbox} - \pause + \ifdefined\longformat\pause\fi \begin{itemize} \item \textbf{Complete/self-contained:} \begin{itemize} @@ -199,25 +205,25 @@ \item Should be usable \alert{without internet} connection. \end{itemize} - \pause + \ifdefined\longformat\pause\fi \item \textbf{Modularity:} Parts of the project should be \alert{re-usable} in other projects. - \pause + \ifdefined\longformat\pause\fi \item \textbf{Plain text:} Project's source should be in \alert{plain-text} \textcolor{gray}{(binary formats need special software)} \begin{itemize} \item This includes high-level analysis. \item It is easily publishable (very low volume of $\times100$KB), archivable, and parse-able. 
\item \alert{Version control} (e.g., with Git) can track project's history. \end{itemize} - \pause + \ifdefined\longformat\pause\fi \item \textbf{Minimal complexity:} Occum’s rasor: “Never posit pluralities without necessity”. \begin{itemize} \item Avoiding the \alert{fashionable} tool of the day: tomorrow another tool will take its place! \item Easier \alert{learning curve}, also doesn't create a \alert{generational gap}. \item Is \alert{compatible} and \alert{extensible}. \end{itemize} - \pause + \ifdefined\longformat\pause\fi \item \textbf{Verifable inputs and outputs:} Inputs and Outputs must be \alert{automatically verified}. - \pause + \ifdefined\longformat\pause\fi \item \textbf{Free and open source software:} \alert{Free software} is essential: non-free software is not configurable, not distributable, and dependent on non-free provider (which may discontinue it in N years). \end{itemize} \end{frame} @@ -231,7 +237,7 @@ - + \ifdefined\longformat \begin{frame}{Predefined/exact software tools} \small \begin{columns} @@ -266,7 +272,7 @@ \includegraphics[width=\linewidth]{img/version.png} \end{columns} \end{frame} - + \fi @@ -310,30 +316,6 @@ - \begin{frame}{Controlled environment and build instructions} - \small - \begin{columns} - \column{5.5cm} - \includegraphics[width=0.9\linewidth]{img/env.png} - \column{5.5cm} - \includegraphics[width=0.9\linewidth]{img/build.png} - \end{columns} - \end{frame} - - \begin{frame}{Controlled environment and build instructions} - \small - \begin{columns} - \column{5.5cm} - \includegraphics[width=0.9\linewidth]{img/env-highlighted.png} - \column{5.5cm} - \includegraphics[width=0.9\linewidth]{img/build-highlighted.png} - \end{columns} - \end{frame} - - - - - \begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies} \Wider[5em]{ %\vspace{5mm} @@ -343,8 +325,7 @@ \vspace{3mm}\tiny From ``Attributing and Referencing (Research) Software: Best Practices and Outlook from Inria'' (Alliez et - al. 
2019, - \textcolor{blue}{\href{https://hal.archives-ouvertes.fr/hal-02135891}{hal-02135891}}) + al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}). } \end{frame} @@ -379,10 +360,14 @@ \end{frame} + + + \ifdefined\longformat \begin{frame}{Software citation automatically generated in paper (including Astropy)} \centering \includegraphics[width=0.8\linewidth]{img/software-cite.jpg} \end{frame} + \fi \begin{frame}{Software citation automatically generated in paper (including Astropy)} \centering \includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg} @@ -393,6 +378,7 @@ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \let\focusonhardware\undefined + \ifdefined\longformat \begin{frame}{Input data source and integrity is documented and checked} \small \begin{columns} @@ -418,6 +404,7 @@ \includegraphics[width=\linewidth]{img/inputs.png} \end{columns} \end{frame} + \fi \begin{frame}{Input data source and integrity is documented and checked} \small @@ -463,7 +450,7 @@ - + \ifdefined\longformat \begin{frame}{Reproducible science: Maneage is managed through a Makefile} \small \begin{columns} @@ -530,6 +517,7 @@ \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-1.png} \end{columns} \end{frame} + \fi \begin{frame}{Reproducible science: Maneage is managed through a Makefile} \small \begin{columns} @@ -592,7 +580,7 @@ \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame} \let\focusonpaper\undefined - + \ifdefined\longformat \begin{frame}{Values in final report/paper} All analysis \alert{results} (numbers, plots, tables) written in paper's PDF as \alert{\LaTeX{} macros}. 
They are thus @@ -603,6 +591,7 @@ \vspace{0.4cm} \includegraphics[width=\linewidth]{img/reproducible-latex.png} \end{frame} + \fi \begin{frame}{Values in final report/paper} All analysis \alert{results} (numbers, plots, tables) written in @@ -618,13 +607,14 @@ - + \ifdefined\longformat \begin{frame}{Analysis step results/values concatenated into a single file.} All \LaTeX{} macros come from a \alert{single file}. \begin{center} \includegraphics[width=0.6\linewidth]{img/reproducible-macros.png} \end{center} \end{frame} + \fi \begin{frame}{Analysis step results/values concatenated into a single file.} All \LaTeX{} macros come from a \alert{single file}. \begin{center} @@ -637,7 +627,7 @@ - + \ifdefined\longformat \begin{frame}{Analysis results stored as \LaTeX{} macros} The analysis scripts write/update the \LaTeX{} macro values automatically. @@ -645,6 +635,7 @@ \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png} \end{center} \end{frame} + \fi \begin{frame}{Analysis results stored as \LaTeX{} macros} The analysis scripts write/update the \LaTeX{} macro values automatically. @@ -654,12 +645,35 @@ \end{frame} + %% Make demo. - \begin{frame} - \LARGE - \vspace{1cm} - \hfill Let's see how the analysis is managed in a hypothetical project... + \begin{frame}{Let's look at the data lineage to replicate Figure 1C (green/tool) of Menke+2020 \\(DOI:\href{https://doi.org/10.1101/2020.01.15.908111}{10.1101/2020.01.15.908111}), as done in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}} for a demo.} + \begin{columns} + \column{0.55\linewidth} + \textcolor{blue}{ORIGINAL PLOT} + + The Green plot shows the fraction of papers mentioning software tools from 1997 to 2019. 
+ \column{0.45\linewidth} + \includegraphics[width=\linewidth]{img/tools-per-year-orig.jpg} + \end{columns} + + \rule{\textwidth}{1pt} + + \begin{columns} + \column{0.4\linewidth} + \textcolor{green!70!black}{OUR enhanced REPLICATION} + + The green line is same as above but over their full historical range. + + Red histogram is the number of papers studied in each year + \column{0.6\linewidth} + \vspace{1cm} + \includegraphics[width=\linewidth]{img/tools-per-year.pdf} + \end{columns} \end{frame} + + + \ifdefined\longformat \makedemoslide{img/data-lineage-1.pdf} {Makefiles (\texttt{\*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}} \makedemoslide{img/data-lineage-2.pdf} @@ -673,37 +687,31 @@ \makedemoslide{img/data-lineage-6.pdf} {Basic project info comes from \texttt{initialize.tex}.} \makedemoslide{img/data-lineage-7.pdf} - {Reported values about the downloaded inputs come from \texttt{download.tex}.} + {The paper includes some information about the plot.} \makedemoslide{img/data-lineage-8.pdf} - {... for example the number of rows in the second input (a catalog) of the project.} + {The final plotted data are calculated and stored in \texttt{tools-per-year.txt}.} \makedemoslide{img/data-lineage-9.pdf} - {The URL to download \texttt{input2.dat}, and a checksum to validate it, are stored in \texttt{INPUTS.conf}.} + {The plot's calculation is done on a formatted sub-set of the raw input data.} \makedemoslide{img/data-lineage-10.pdf} - {Reported values from first analysis steps stored in \texttt{analysis1.tex}.} + {The raw data that were downloaded are stored in XLSX format.} \makedemoslide{img/data-lineage-11.pdf} - {... 
for example the average of the numbers in \texttt{out-1b.dat}.} + {The download URL \emph{and} a \alert{checksum to validate} the raw inputs, are stored in \texttt{INPUTS.conf}.} \makedemoslide{img/data-lineage-12.pdf} - {But \texttt{out-1b.dat} itself depends on other files and a paramter (for example a multiple of sigma).} + {We also need to report the URL in the paper...} \makedemoslide{img/data-lineage-13.pdf} - {\texttt{out-1a.dat} is built from a downloaded dataset.} + {Some general info about the full dataset may also be reported.} + \fi + + \ifdefined\longformat + \makedemoslide{img/data-lineage-14.pdf} + {We report the number of papers studied in a special year, desired year is stored in \texttt{.conf} file.} + \else \makedemoslide{img/data-lineage-14.pdf} - {Download URL and checksum of \texttt{input1.dat} also stored in \texttt{INPUTS.conf}.} + {All analysis steps cascade down to paper.pdf (URL and checksum of input in \texttt{INPUTS.conf}).} + \fi + \makedemoslide{img/data-lineage-15.pdf} - {Reported values from second analysis steps stored in \texttt{analysis2.tex}.} - \makedemoslide{img/data-lineage-16.pdf} - {... for example the number of selected rows in \texttt{out-2b.dat}.} - \makedemoslide{img/data-lineage-17.pdf} - {\texttt{out-2b.dat} is derived from \texttt{out-1b.dat} (for example, rejected some of \texttt{out-1b.dat}'s rows).} - \makedemoslide{img/data-lineage-18.pdf} - {Reported values from third analysis steps stored in \texttt{analysis3.tex}.} - \makedemoslide{img/data-lineage-19.pdf} - {... 
for example measurements from both \texttt{out-3a.dat} and \texttt{out-3b.dat}.} - \makedemoslide{img/data-lineage-20.pdf} - {\texttt{out-3b.dat} is generated from an analysis on \texttt{out-2a.dat}.} - \makedemoslide{img/data-lineage-21.pdf} - {But \texttt{out-2a.dat} itself is generated from \texttt{input1.dat} and an analysis which has two settings.} - \makedemoslide{img/data-lineage-22.pdf} - {\texttt{out-3a.dat} also depends on \texttt{out-1a.dat} and an analysis with needs one parameter.} + {It is very easy to expand the project and add new analysis steps (this solution is scalable)} @@ -742,7 +750,7 @@ - + \ifdefined\longformat \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\projinit}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} @@ -757,8 +765,18 @@ \newcommand{\githappy}{} \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} \newcommand{\gitverified}{} + \else + \newcommand{\projinit}{} + \newcommand{\projwork}{} + \newcommand{\tempevolve}{} + \newcommand{\mergewithtemp}{} + \newcommand{\tofuture}{} + \newcommand{\githappy}{} + \newcommand{\gitverified}{} + \fi \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame} + \ifdefined\longformat \begin{frame}{Two recent examples (publishing Git checksum in abstract)} \begin{columns} \column{0.5\linewidth} @@ -769,6 +787,7 @@ \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491.png} \end{columns} \end{frame} + \fi \begin{frame}{Two recent examples (publishing Git checksum in abstract)} \begin{columns} @@ -804,12 +823,10 @@ \begin{itemize} \item \alert{arXiv}: uploaded with the \LaTeX{} source to always stay with the paper \\(for example - \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}). The - file containing all macros must also be uploaded so arXiv's - server can easily build the \LaTeX{} source. 
+ \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}} or \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}). \item \alert{Zenodo}: Along with all the input datasets (many Gigabytes) and software \\(for example - \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481}}) and given a unique DOI. + \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3872248}{zenodo.3872248}}) and given a unique DOI. \end{itemize} \end{frame} @@ -817,39 +834,36 @@ - \begin{frame}{Project source and its execution} - \begin{tcolorbox} - Programs \textcolor{gray}{[here: Scientific projects]} must be - written for \alert{people to read}... - - \hfill ...and only \emph{incidentally} for machines to - \emph{execute}. - - \vspace{2mm} - \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs - \end{tcolorbox} - \end{frame} - - - - - - \begin{frame}[t]{General outline of using this system (for example \href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230})} + \begin{frame}[t]{General outline of using Maneage (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})} \vspace{1cm} - \texttt{\$ git clone http://gitlab.com/makhlaghi/iau-symposium-355{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\ + \texttt{\$ git clone https://gitlab.com/makhlaghi/maneage-paper{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\ - \pause + \ifdefined\longformat\pause\fi \vspace{1.5cm} \texttt{\$ ./project configure { }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# You will specify the build directory on your system,}}\\ \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# and it will build all software (about 1.5 hours).}} - \pause + \ifdefined\longformat\pause\fi \vspace{1.5cm} \texttt{\$ ./project make { }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# Does all the analysis and makes final PDF.}}\\ 
\end{frame} + \ifdefined\longformat + \begin{frame}{Project source and its execution} + \begin{tcolorbox} + Programs \textcolor{gray}{[here: Scientific projects]} must be + written for \alert{people to read}... + + \hfill ...and only \emph{incidentally} for machines to + \emph{execute}. + + \vspace{2mm} + \hfill \footnotesize Harold Abelson, Structure and Interpretation of Computer Programs + \end{tcolorbox} + \end{frame} + \fi @@ -887,6 +901,52 @@ \end{itemize} \end{frame} + + + + + \begin{frame}{Summary:} + + Maneage and its principles are described in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}. + It is a customizable template that will do the following steps/instructions (all in simple plain text files). + \begin{itemize} + \item \alert{Automatically downloads} the necessary + \emph{software} and \emph{data}. + \item \alert{Builds} the software in a \alert{closed + environment}. + \item Runs the software on data to \alert{generate} the final + \alert{research results}. + \item Modification of part of the analysis will only + result in re-doing that part, not the whole project. + \item Using LaTeX macros, paper's figures, tables and numbers + will be \alert{Automatically updated} after a change in + analysis. Allowing the scientist to focus on the scientific + interpretation. + \item The whole project is under \alert{version control} (Git) + to allow easy reversion to a previous state. This + \alert{encourages tests/experimentation} in the analysis. + \item The \alert{Git commit hash} of the project source, is + \alert{printed} in the published paper and \alert{saved on + output} data products. Ensuring the + integrity/reproducibility of the result. 
+ \item \colorbox{green!30!white}{These slides are available at + \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.} + \item \colorbox{green!15!white}{Longer slides are available at + \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} + \end{itemize} + + \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, + top=1pt, bottom=1pt] + For a technical description of Maneage's implementation, as well + as a checklist to customize it, and tips on good practices, + please see this page: + + \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} + \end{tcolorbox} + \end{frame} + + +\ifdefined\longformat \begin{frame}{Existing technologies (Independent environment)} \begin{itemize} \setlength\itemsep{7mm} @@ -910,13 +970,11 @@ \vspace{3mm} In summary, they only \alert{store a built} environment (they are outputs, not good for archiving). - \end{frame} - \begin{frame}{Existing technologies (Package managers)} \begin{itemize} @@ -957,45 +1015,5 @@ In summary, they only \alert{store a built} environment (they are outputs, not g \end{itemize} \alert{Summary}: except for IPOL, most solutions surveyed have far too many dependencies to be usable \alert{beyond the immediate future}. \end{frame} - - - - - \begin{frame}{Summary:} - - Maneage is introduced as a customizable template that will do the - following steps/instructions (all in simple plain text files). - \begin{itemize} - \item \alert{Automatically downloads} the necessary - \emph{software} and \emph{data}. - \item \alert{Builds} the software in a \alert{closed - environment}. - \item Runs the software on data to \alert{generate} the final - \alert{research results}. - \item A modification in one part of the analysis will only - result in re-doing that part, not the whole project. - \item Using LaTeX macros, paper's figures, tables and numbers - will be \alert{Automatically updated} after a change in - analysis. 
Allowing the scientist to focus on the scientific - interpretation. - \item The whole project is under \alert{version control} (Git) - to allow easy reversion to a previous state. This - \alert{encourages tests/experimentation} in the analysis. - \item The \alert{Git commit hash} of the project source, is - \alert{printed} in the published paper and \alert{saved on - output} data products. Ensuring the - integrity/reproducibility of the result. - \item \colorbox{green!30!white}{These slides are available at - \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.} - \end{itemize} - - \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt, - top=1pt, bottom=1pt] - For a technical description of Maneage's implementation, as well - as a checklist to customize it, and tips on good practices, - please see this page: - - \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}} - \end{tcolorbox} - \end{frame} +\fi \end{document} diff --git a/slides-intro.tex b/slides-intro.tex index f4fcf97..3ffedf9 100644 --- a/slides-intro.tex +++ b/slides-intro.tex @@ -383,8 +383,7 @@ for computational reproducibility] \vspace{3mm}\tiny From ``Attributing and Referencing (Research) Software: Best Practices and Outlook from Inria'' (Alliez et - al. 2019, - \textcolor{blue}{\href{https://hal.archives-ouvertes.fr/hal-02135891}{hal-02135891}}) + al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}). } \end{frame} \begin{frame}{Impact of ``Dependency hell'' on native building in various hardware (CPU architectures)} @@ -1048,10 +1047,29 @@ for computational reproducibility] %% Make demo. - \begin{frame} - \LARGE - \vspace{1cm} - \hfill Let's see how the analysis is managed in a hypothetical project... 
+ \begin{frame}{Let's look at the data lineage to replicate Figure 1C (green/tool) of Menke+2020 \\(DOI:\href{https://doi.org/10.1101/2020.01.15.908111}{10.1101/2020.01.15.908111})} + \begin{columns} + \column{0.55\linewidth} + \textcolor{blue}{ORIGINAL PLOT} + + The Green plot shows the fraction of papers mentioning software tools from 1997 to 2019. + \column{0.45\linewidth} + \includegraphics[width=\linewidth]{img/tools-per-year-orig.jpg} + \end{columns} + + \rule{\textwidth}{1pt} + + \begin{columns} + \column{0.4\linewidth} + \textcolor{green!70!black}{OUR enhanced REPLICATION} + + The green line is same as above but over their full historical range. + + Red histogram is the number of papers studied in each year + \column{0.6\linewidth} + \vspace{1cm} + \includegraphics[width=\linewidth]{img/tools-per-year.pdf} + \end{columns} \end{frame} \makedemoslide{img/data-lineage-1.pdf} {Makefiles (\texttt{\*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}} @@ -1066,37 +1084,24 @@ for computational reproducibility] \makedemoslide{img/data-lineage-6.pdf} {Basic project info comes from \texttt{initialize.tex}.} \makedemoslide{img/data-lineage-7.pdf} - {Reported values about the downloaded inputs come from \texttt{download.tex}.} + {The paper includes some information about the plot.} \makedemoslide{img/data-lineage-8.pdf} - {... 
for example the number of rows in the second input (a catalog) of the project.} + {The final plotted data are calculated and stored in \texttt{tools-per-year.txt}.} \makedemoslide{img/data-lineage-9.pdf} - {The URL to download \texttt{input2.dat}, and a checksum to validate it, are stored in \texttt{INPUTS.conf}.} + {The plot's calculation is done on a formatted sub-set of the raw input data.} \makedemoslide{img/data-lineage-10.pdf} - {Reported values from first analysis steps stored in \texttt{analysis1.tex}.} + {The raw data that were downloaded are stored in XLSX format.} \makedemoslide{img/data-lineage-11.pdf} - {... for example the average of the numbers in \texttt{out-1b.dat}.} + {The download URL \emph{and} a \alert{checksum to validate} the raw inputs, are stored in \texttt{INPUTS.conf}.} \makedemoslide{img/data-lineage-12.pdf} - {But \texttt{out-1b.dat} itself depends on other files and a paramter (for example a multiple of sigma).} + {We also need to report the URL in the paper...} \makedemoslide{img/data-lineage-13.pdf} - {\texttt{out-1a.dat} is built from a downloaded dataset.} + {Some general info about the full dataset may also be reported.} \makedemoslide{img/data-lineage-14.pdf} - {Download URL and checksum of \texttt{input1.dat} also stored in \texttt{INPUTS.conf}.} + {We report the number of papers studied in a special year, desired year is stored in \texttt{.conf} file.} + \makedemoslide{img/data-lineage-15.pdf} - {Reported values from second analysis steps stored in \texttt{analysis2.tex}.} - \makedemoslide{img/data-lineage-16.pdf} - {... for example the number of selected rows in \texttt{out-2b.dat}.} - \makedemoslide{img/data-lineage-17.pdf} - {\texttt{out-2b.dat} is derived from \texttt{out-1b.dat} (for example, rejected some of \texttt{out-1b.dat}'s rows).} - \makedemoslide{img/data-lineage-18.pdf} - {Reported values from third analysis steps stored in \texttt{analysis3.tex}.} - \makedemoslide{img/data-lineage-19.pdf} - {... 
for example measurements from both \texttt{out-3a.dat} and \texttt{out-3b.dat}.} - \makedemoslide{img/data-lineage-20.pdf} - {\texttt{out-3b.dat} is generated from an analysis on \texttt{out-2a.dat}.} - \makedemoslide{img/data-lineage-21.pdf} - {But \texttt{out-2a.dat} itself is generated from \texttt{input1.dat} and an analysis which has two settings.} - \makedemoslide{img/data-lineage-22.pdf} - {\texttt{out-3a.dat} also depends on \texttt{out-1a.dat} and an analysis with needs one parameter.} + {It is very easy to expand the project and add new analysis steps (this solution is scalable)} diff --git a/tex/git-branch.tex b/tex/git-branch.tex index fa35e0d..ad5b2c9 100644 --- a/tex/git-branch.tex +++ b/tex/git-branch.tex @@ -153,7 +153,7 @@ \end{itemize} \ifdefined\gitverified - \vspace{3mm} + \vspace{-1mm} \hfill\tiny ``Verified'' image from \href{https://www.vectorstock.com/royalty-free-vector/red-vintage-verified-stamp-retro-style-on-white-vector-22770076}{vectorstock.com} \fi \end{columns} -- cgit v1.2.1