aboutsummaryrefslogtreecommitdiff
path: root/slides-intro-short.tex
blob: aba292d393a46cee996bb6d89cda67a9fe22fca7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
% LaTeX source of slides on reproducible paper.
%
% Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
%
% This LaTeX source is free software: you can redistribute it and/or
% modify it under the terms of the GNU General Public License as
% published by the Free Software Foundation, either version 3 of the
% License, or (at your option) any later version.
%
% This LaTeX source is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
% General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this LaTeX source.  If not, see <https://www.gnu.org/licenses/>.

% Basic LaTeX settings.
\documentclass[9pt,usenames,dvipsnames,aspectratio=169]{beamer}

% Make it super short.
%
% NOTE(review): while `\longformat' is defined, every
% `\ifdefined\longformat' test in this file takes its true branch, which
% enables the extra `\pause' steps and the additional frames guarded by
% that test.  That looks like the opposite of "super short" -- confirm
% whether this definition is meant to be commented out in this file.
\newcommand{\longformat}{}

% Read the current Git commit information and the settings in
% `tex/preamble'.  `\input' is used instead of `\include' because these
% files are read in the preamble: `\include' is meant for the document
% body (it issues `\clearpage' and writes a per-file `.aux'), none of
% which is wanted here.
\input{git-commit}
\input{tex/preamble}

%% Beamer settings.
%\setbeamertemplate{footline}[frame number]

%% Packages to import.
\usepackage{tcolorbox}          %For a color-box.
\usepackage{textcomp}           %For a copyright sign.

%% Shorthand for arXiv references: `\arxivlink{ID}' prints a small,
%% parenthesized, blue `arXiv:ID' that links to the abstract page.
\newcommand{\arxivlink}[1]{%
  {\footnotesize(\textcolor{blue}{\href{https://arxiv.org/abs/#1}{arXiv:#1}})}%
}

%% Title of the talk: two title lines, followed by a small note on the
%% grant and on the submitted paper (with links).
\title{Introducing Maneage:\\
  Customizable framework for managing data lineage\\
  \vspace{2mm}{\small [RDA Europe Adoption grant recipient.
    Submitted to \href{https://www.computer.org/csdl/magazine/cs}{IEEE CiSE}
    (\textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}),
    Comments welcome]}
}

%% Set the author
\author{\vspace{8mm}\\
  \href{https://akhlaghi.org}{Mohammad Akhlaghi}\\\vspace{0.5mm}
  \footnotesize
  Instituto de Astrof\'isica de Canarias ({\scriptsize IAC}), Tenerife, Spain
}

%% Set the date and institutional logos.
\date{\footnotesize\vspace{0cm}\\
  \href{https://www.bsc.es/news/events/rda-spain-webinar}{RDA Spain webinar}\\July 9th, 2020\\
  \tiny\vspace{3mm}
  Most recent slides available in link below (this PDF is built from \href{http://git.maneage.org/slides-intro.git}{Git commit} \gitcommit):\\
  \footnotesize\textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}\\
  \vspace{2mm}\hspace{-0.25cm}
  \raisebox{+0.4\height}{\includegraphics[width=2.5cm]{img/ministerio-ciencia.png}}
  \raisebox{+0.3\height}{\includegraphics[width=1.3cm]{img/sundial.png}}
  \includegraphics[width=1.2cm]{img/iac.png}
  \includegraphics[width=1cm]{img/eu-sundial.png}
  \raisebox{0.13\height}{\includegraphics[width=1cm]{img/eu-regional.png}}
  \raisebox{0.05\height}{\includegraphics[width=1cm]{img/eu-rdaeu4.png}}
  \raisebox{+0.1\height}{\includegraphics[width=1.4cm]{img/rda-europe.png}}
  \raisebox{+1.3\height}{\includegraphics[width=1.4cm]{img/ull.png}}
  { }\raisebox{+0.5\height}{\includegraphics[width=2cm]{img/gobierno-canarias.png}}\\
  \vspace{1cm}
}




















\begin{document}

  \begin{frame}
    \titlepage
  \end{frame}
  \usebackgroundtemplate{ }    %% undeclare it


  \begin{frame}{Challenges of the RDA-WDS Publishing Data Workflows WG {\small (DOI:\href{https://doi.org/10.1007/s00799-016-0178-2}{10.1007/s00799-016-0178-2})}}
    Challenges (also relevant to researchers, not just repositories)
    \begin{itemize}
    \item \emph{Bi-directional linking}: how to \alert{link data and publications}.
    \item \emph{\alert{Software management}:} how to manage, preserve, publish and cite software?
    \item \emph{Metrics:} \alert{how often} are data used.
    \item \emph{Incentives to researchers:} how to \alert{communicate benefits} of following good practices \alert{to researchers}.
    \end{itemize}

    \begin{center}
      \includegraphics[width=4cm]{img/rda.png}\hspace{1cm}
      \includegraphics[width=4cm]{img/wds.jpg}
    \end{center}

    \ifdefined\longformat\pause\fi

    ``\emph{We would like to see a workflow that results in all
      \textcolor{blue!30!green}{\bf scholarly objects being connected},
      linked, citable, and persistent to allow researchers to navigate
      smoothly and to \alert{\bf enable reproducible research}.  This
      includes \alert{{\bf linkages} between documentation, code, data, and
        journal articles in an integrated environment}. Furthermore,
      in the ideal workflow, all of these objects need to be
      \alert{\bf well documented} to enable other researchers (or
      citizen scientists etc) to reuse the data for new
      discoveries.}''
  \end{frame}

  %% Flag macros defined step by step before each inclusion of
  %% `tex/project-graph'.  NOTE(review): presumably that file checks
  %% them (e.g. with `\ifdefined') to decide which parts of the graph to
  %% draw -- confirm against tex/project-graph.tex.
  \newcommand{\allopacity}{1}
  %% The intermediate frames are only built when `\longformat' is defined.
  \ifdefined\longformat
  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
  \fi
  \newcommand{\paperinit}{}
  \ifdefined\longformat
  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
  \fi
  %% All remaining flags are defined together (as empty macros) so the
  %% next frame is built with every one of them set.
  \newcommand{\sver}{}
  \newcommand{\srep}{}
  \newcommand{\dver}{}
  \newcommand{\ddver}{}
  \newcommand{\confopt}{}
  \newcommand{\confenv}{}
  \newcommand{\containers}{}
  \newcommand{\db}{}
  \newcommand{\calib}{}
  \newcommand{\corr}{}
  \newcommand{\runord}{}
  \newcommand{\runopt}{}
  \newcommand{\humanerr}{}
  \newcommand{\confirmbias}{}
  \newcommand{\depupdate}{}
  \newcommand{\coauth}{}
  \newcommand{\varsinpaper}{}
  \newcommand{\recordinfo}{}
  \newcommand{\softcite}{}
  \newcommand{\prevchange}{}
  \newcommand{\paperfinal}{}
  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}

  %% Don't show the happy scientist or the existing containers box.
  %% These four flags are undefined again here; `\allopacity' is later
  %% re-created with `\newcommand', which requires it to be undefined.
  \let\paperinit\undefined
  \let\allopacity\undefined
  \let\paperfinal\undefined
  \let\containers\undefined





  \begin{frame}{Science is a tricky business}
    \begin{center}
      \includegraphics[width=0.9\linewidth]{img/nature-cartoon.jpg}
    \end{center}

    \vspace{-0.3cm}\hfill
    {\tiny Image from nature.com
      (``\href{https://www.nature.com/articles/d41586-017-07522-z}{Five
        ways to fix statistics}'', Nov 2017)}\hspace{7mm}

    \vspace{-1mm}
    \begin{tcolorbox}[boxsep=0pt,left=1mm,right=1mm,top=1mm,bottom=1mm]
      \small Data analysis [...] is a \alert{human
        behavior}. Researchers who hunt hard enough will turn up a
      result that fits statistical criteria, but their
      \alert{discovery} will probably be a \alert{false positive}.

      \hfill Five ways to fix statistics, Nature, 551, Nov 2017.
    \end{tcolorbox}
  \end{frame}





  \begin{frame}{Founding criteria}
    \begin{tcolorbox}[title=Basic/simple principle:]
      \centering Science is defined by its METHOD, \alert{not} its
      result.
    \end{tcolorbox}

    \ifdefined\longformat\pause\fi
    \begin{itemize}
    \item \textbf{Complete/self-contained:}
      \begin{itemize}
      \item \alert{Only dependency} should be \alert{POSIX} tools \textcolor{gray}{(discards Conda or Jupyter which need Python)}.
      \item Must \alert{not require root} permissions \textcolor{gray}{(discards tools like Docker or Nix/Guix)}.
      \item Should be \alert{non-interactive} or runnable in batch (user interaction is an incompleteness).
      \item Should be usable \alert{without internet} connection.
      \end{itemize}

    \ifdefined\longformat\pause\fi
    \item \textbf{Modularity:} Parts of the project should be \alert{re-usable} in other projects.
    \ifdefined\longformat\pause\fi
    \item \textbf{Plain text:} Project's source should be in \alert{plain-text} \textcolor{gray}{(binary formats need special software)}
      \begin{itemize}
      \item This includes high-level analysis.
      \item It is easily publishable (very low volume of $\times100$KB), archivable, and parse-able.
      \item \alert{Version control} (e.g., with Git) can track project's history.
      \end{itemize}
    \ifdefined\longformat\pause\fi
    \item \textbf{Minimal complexity:} Occam's razor: ``Never posit pluralities without necessity''.
      \begin{itemize}
      \item Avoiding the \alert{fashionable} tool of the day: tomorrow another tool will take its place!
      \item Easier \alert{learning curve}, also doesn't create a \alert{generational gap}.
      \item Is \alert{compatible} and \alert{extensible}.
      \end{itemize}
    \ifdefined\longformat\pause\fi
    \item \textbf{Verifiable inputs and outputs:} Inputs and outputs must be \alert{automatically verified}.
    \ifdefined\longformat\pause\fi
    \item \textbf{Free and open source software:} \alert{Free software} is essential: non-free software is not configurable, not distributable, and dependent on non-free provider (which may discontinue it in N years).
    \end{itemize}
  \end{frame}



  \newcommand{\focusonpackages}{}
  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
  \let\focusonpackages\undefined




  \ifdefined\longformat
  \begin{frame}{Predefined/exact software tools}
    \small
    \begin{columns}
      \column{10cm}
      \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
                        top=1pt, bottom=1pt, title=Reproducibility \&
                        software]
        \footnotesize Reproducing the environment (specific
        \alert{software versions}, \alert{build instructions} and
        \alert{dependencies}) is also critically important for
        reproducibility.
      \end{tcolorbox}

      \vspace{2cm}

      \begin{itemize}
        \setlength\itemsep{0.6cm}
      \item \emph{Containers} or \emph{Virtual Machines} are a
        \alert{binary black box}.

      \item Maneage \alert{installs fixed versions} of all
        necessary research software and their dependencies.

      \item Installs similar environment on \alert{GNU/Linux}, or
        \alert{macOS} systems.

      \item Works very much like a package manager (e.g.,
        \alert{\texttt{apt}} or \alert{\texttt{brew}}).
      \end{itemize}

      \column{5cm}
      \includegraphics[width=\linewidth]{img/version.png}
    \end{columns}
  \end{frame}





  \begin{frame}{Predefined/exact software tools}
    \small
    \begin{columns}
      \column{10cm}
      \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
                        top=1pt, bottom=1pt, title=Reproducibility \&
                        software]
        \footnotesize Reproducing the environment (specific
        \alert{software versions}, \alert{build instructions} and
        \alert{dependencies}) is also critically important for
        reproducibility.
      \end{tcolorbox}

      \vspace{2cm}

      \begin{itemize}
        \setlength\itemsep{0.6cm}
      \item \emph{Containers} or \emph{Virtual Machines} are a
        \alert{binary black box}.

      \item Maneage \alert{installs fixed versions} of all
        necessary research software and their dependencies.

      \item Installs similar environment on \alert{GNU/Linux}, or
        \alert{macOS} systems.

      \item Works very much like a package manager (e.g.,
        \alert{\texttt{apt}} or \alert{\texttt{brew}}).
      \end{itemize}

      \column{5cm}
      \includegraphics[width=\linewidth]{img/version-highlighted.png}
    \end{columns}
  \end{frame}
  \fi




  \begin{frame}{Example: Matplotlib (a Python visualization library) build dependencies}
    \Wider[5em]{
      %\vspace{5mm}
      \begin{center}
        \includegraphics[width=0.9\linewidth]{img/matplotlib.png}
      \end{center}

      \vspace{3mm}\tiny From ``Attributing and Referencing (Research)
      Software: Best Practices and Outlook from Inria'' (Alliez et
      al. 2020, CiSE, DOI:\textcolor{blue}{\href{https://doi.org/10.1109/MCSE.2019.2949413}{10.1109/MCSE.2019.2949413}}).
    }
  \end{frame}







  \begin{frame}{Advantages of this build system}
    \begin{columns}
      \column{11cm}
      \begin{itemize}
        \setlength\itemsep{0.7cm}
      \item Project runs in fixed/controlled environment: custom build
        of \alert{Bash}, \alert{Make}, GNU Coreutils
        (\alert{\texttt{ls}}, \alert{\texttt{cp}},
        \alert{\texttt{mkdir}}, etc.), \alert{AWK}, \alert{SED},
        \alert{\LaTeX}, etc.
      \item No need for \alert{root}/administrator \alert{permissions}
        (on servers or super computers).
      \item Whole system is built \alert{automatically} on any
        Unix-like operating system \\(less than 2 hours).
      \item Dependencies of different projects will \alert{not conflict}.
      \item Everything in \alert{plain text} (human \& computer
        readable/archivable).
      \end{itemize}
      \column{4cm}
      \includegraphics[width=\linewidth]{img/unchained.jpg}\\
      \tiny \url{https://natemowry2.wordpress.com}
    \end{columns}
  \end{frame}




  \ifdefined\longformat
  \begin{frame}{Software citation automatically generated in paper (including Astropy)}
    \centering
    \includegraphics[width=0.8\linewidth]{img/software-cite.jpg}
  \end{frame}
  \fi
  \begin{frame}{Software citation automatically generated in paper (including Astropy)}
    \centering
    \includegraphics[width=0.8\linewidth]{img/software-cite-highlighted.jpg}
  \end{frame}

  %% Hardware/data
  \newcommand{\focusonhardware}{}
  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
  \let\focusonhardware\undefined

  \ifdefined\longformat
  \begin{frame}{Input data source and integrity is documented and checked}
    \small
    \begin{columns}
      \column{10cm}
      Stored information about each input file:
      \begin{itemize}
      \item \alert{PID} (where available).
      \item Download \alert{URL}.
      \item \alert{MD5}-sum to check integrity.
      \end{itemize}

      \vspace{0.75cm} All inputs are \alert{downloaded} from the given
      PID/URL when necessary\\(during the analysis).

      \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the
      download was done properly or the file is the same (hasn't
      changed on the server/source).

      \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\
      This paper needs three input files (two images, one catalog).

      \column{5cm}
      \includegraphics[width=\linewidth]{img/inputs.png}
    \end{columns}
  \end{frame}
  \fi

  \begin{frame}{Input data source and integrity is documented and checked}
    \small
    \begin{columns}
      \column{10cm}
      Stored information about each input file:
      \begin{itemize}
      \item \alert{PID} (where available).
      \item Download \alert{URL}.
      \item \alert{MD5}-sum to check integrity.
      \end{itemize}

      \vspace{0.75cm} All inputs are \alert{downloaded} from the given
      PID/URL when necessary\\(during the analysis).

      \vspace{0.75cm} MD5-sums are \alert{checked} to make sure the
      download was done properly or the file is the same (hasn't
      changed on the server/source).

      \vspace{0.75cm}Example from the reproducible paper \textcolor{blue}{\href{https://arxiv.org/abs/1909.11230}{arXiv:1909.11230}}.\\
      This paper needs three input files (two images, one catalog).

      \column{5cm}
      \includegraphics[width=\linewidth]{img/inputs-highlighted.png}
    \end{columns}
  \end{frame}











  %% Analysis
  \newcommand{\focusonrun}{}
  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
  \let\focusonrun\undefined




  \ifdefined\longformat
  \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
    \small
    \begin{columns}
      \column{10cm}

      All steps (downloading and analysis) are managed by Makefiles\\
      (example from
      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):

      \vspace{5mm}
      \begin{itemize}
        \setlength\itemsep{0.7cm}
      \item Unlike a script which always starts from the top, a
        Makefile \alert{starts from the end} and steps that don't
        change will be left untouched (not remade).
      \item A single \emph{rule} can \alert{manage any number of
        files}.
      \item Make can identify independent steps internally and do them
        in \alert{parallel}.
      \item Make was \alert{designed for complex projects} with
        thousands of files (all major Unix-like components), so it is
        highly evolved and efficient.
      \item Make is a very \alert{simple} and \alert{small} language,
        thus easy to learn with great and free documentation (for
        example
        \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
            Make's manual}}).
      \end{itemize}

      \column{5cm}
      \includegraphics[width=\linewidth]{img/reproducible-makefile.png}
    \end{columns}
  \end{frame}
  \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
    \small
    \begin{columns}
      \column{10cm}

      All steps (downloading and analysis) are managed by Makefiles\\
      (example from
      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):

      \vspace{5mm}
      \begin{itemize}
        \setlength\itemsep{0.7cm}
      \item Unlike a script which always starts from the top, a
        Makefile \alert{starts from the end} and steps that don't
        change will be left untouched (not remade).
      \item A single \emph{rule} can \alert{manage any number of
        files}.
      \item Make can identify independent steps internally and do them
        in \alert{parallel}.
      \item Make was \alert{designed for complex projects} with
        thousands of files (all major Unix-like components), so it is
        highly evolved and efficient.
      \item Make is a very \alert{simple} and \alert{small} language,
        thus easy to learn with great and free documentation (for
        example
        \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
            Make's manual}}).
      \end{itemize}

      \column{5cm}
      \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-1.png}
    \end{columns}
  \end{frame}
  \fi
  \begin{frame}{Reproducible science: Maneage is managed through a Makefile}
    \small
    \begin{columns}
      \column{10cm}

      All steps (downloading and analysis) are managed by Makefiles\\
      (example from
      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.1164774}{zenodo.1164774}}):

      \vspace{5mm}
      \begin{itemize}
        \setlength\itemsep{0.7cm}
      \item Unlike a script which always starts from the top, a
        Makefile \alert{starts from the end} and steps that don't
        change will be left untouched (not remade).
      \item A single \emph{rule} can \alert{manage any number of
        files}.
      \item Make can identify independent steps internally and do them
        in \alert{parallel}.
      \item Make was \alert{designed for complex projects} with
        thousands of files (all major Unix-like components), so it is
        highly evolved and efficient.
      \item Make is a very \alert{simple} and \alert{small} language,
        thus easy to learn with great and free documentation (for
        example
        \textcolor{blue}{\href{https://www.gnu.org/software/make/manual/}{GNU
            Make's manual}}).
      \end{itemize}

      \column{5cm}
      \includegraphics[width=\linewidth]{img/reproducible-makefile-highlighted-2.png}
    \end{columns}
  \end{frame}

























  \newcommand{\focusonpaper}{}
  \begin{frame}{General outline of a project (after data collection)} \include{tex/project-graph} \end{frame}
  \let\focusonpaper\undefined

  \ifdefined\longformat
  \begin{frame}{Values in final report/paper}
    All analysis \alert{results} (numbers, plots, tables) written in
    paper's PDF as \alert{\LaTeX{} macros}. They are thus
    \alert{updated automatically} on any change.\\ Shown here is a
    portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source
    (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).

    \vspace{0.4cm}
    \includegraphics[width=\linewidth]{img/reproducible-latex.png}
  \end{frame}
  \fi

  \begin{frame}{Values in final report/paper}
    All analysis \alert{results} (numbers, plots, tables) written in
    paper's PDF as \alert{\LaTeX{} macros}. They are thus
    \alert{updated automatically} on any change.\\ Shown here is a
    portion of the \textsf{NoiseChisel} paper and its \LaTeX{} source
    (\textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}}).

    \vspace{0.4cm}
    \includegraphics[width=\linewidth]{img/reproducible-latex-highlighted.png}
  \end{frame}




  \ifdefined\longformat
  \begin{frame}{Analysis step results/values concatenated into a single file.}
    All \LaTeX{} macros come from a \alert{single file}.
    \begin{center}
      \includegraphics[width=0.6\linewidth]{img/reproducible-macros.png}
    \end{center}
  \end{frame}
  \fi
  \begin{frame}{Analysis step results/values concatenated into a single file.}
    All \LaTeX{} macros come from a \alert{single file}.
    \begin{center}
      \includegraphics[width=0.6\linewidth]{img/reproducible-macros-highlighted.png}
    \end{center}
  \end{frame}






  \ifdefined\longformat
  \begin{frame}{Analysis results stored as \LaTeX{} macros}
    The analysis scripts write/update the \LaTeX{} macro values
    automatically.
    \begin{center}
      \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro.png}
    \end{center}
  \end{frame}
  \fi
  \begin{frame}{Analysis results stored as \LaTeX{} macros}
    The analysis scripts write/update the \LaTeX{} macro values
    automatically.
    \begin{center}
      \includegraphics[width=0.6\linewidth]{img/reproducible-write-macro-highlight.png}
    \end{center}
  \end{frame}



  %% Make demo.
  \begin{frame}{Let's look at the data lineage to replicate Figure 1C (green/tool) of Menke+2020 \\(DOI:\href{https://doi.org/10.1101/2020.01.15.908111}{10.1101/2020.01.15.908111}), as done in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}} for a demo.}
    \begin{columns}
      \column{0.55\linewidth}
      \textcolor{blue}{ORIGINAL PLOT}

      The Green plot shows the fraction of papers mentioning software tools from 1997 to 2019.
      \column{0.45\linewidth}
      \includegraphics[width=\linewidth]{img/tools-per-year-orig.jpg}
    \end{columns}

    \rule{\textwidth}{1pt}

    \begin{columns}
      \column{0.4\linewidth}
      \textcolor{green!70!black}{OUR enhanced REPLICATION}

      The green line is the same as above but over their full historical range.

      Red histogram is the number of papers studied in each year.
      \column{0.6\linewidth}
      \vspace{1cm}
      \includegraphics[width=\linewidth]{img/tools-per-year.pdf}
    \end{columns}
  \end{frame}


  \ifdefined\longformat
  %% `*' needs no escaping in text.  `\*' is LaTeX's discretionary
  %% multiplication sign, which normally prints nothing in running text,
  %% so the file pattern would have been shown as `.mk' instead of `*.mk'.
  \makedemoslide{img/data-lineage-1.pdf}
                {Makefiles (\texttt{*.mk}) keep contextually separate parts of the project, all imported into \texttt{top-make.mk}}
  \makedemoslide{img/data-lineage-2.pdf}
                {The ultimate purpose of the project is to produce a paper/report (in PDF).}
  \makedemoslide{img/data-lineage-3.pdf}
                {The narrative description, typography and references are in \texttt{paper.tex} \& \texttt{references.tex}.}
  \makedemoslide{img/data-lineage-4.pdf}
                {Analysis outputs (blended into the PDF as \LaTeX{} macros) come from \texttt{project.tex}.}
  \makedemoslide{img/data-lineage-5.pdf}
                {But analysis outputs must first be \emph{verified} (with checksums) before entering the report/paper.}
  \makedemoslide{img/data-lineage-6.pdf}
                {Basic project info comes from \texttt{initialize.tex}.}
  \makedemoslide{img/data-lineage-7.pdf}
                {The paper includes some information about the plot.}
  \makedemoslide{img/data-lineage-8.pdf}
                {The final plotted data are calculated and stored in \texttt{tools-per-year.txt}.}
  \makedemoslide{img/data-lineage-9.pdf}
                {The plot's calculation is done on a formatted sub-set of the raw input data.}
  \makedemoslide{img/data-lineage-10.pdf}
                {The raw data that were downloaded are stored in XLSX format.}
  \makedemoslide{img/data-lineage-11.pdf}
                {The download URL \emph{and} a \alert{checksum to validate} the raw inputs, are stored in \texttt{INPUTS.conf}.}
  \makedemoslide{img/data-lineage-12.pdf}
                {We also need to report the URL in the paper...}
  \makedemoslide{img/data-lineage-13.pdf}
                {Some general info about the full dataset may also be reported.}
  \fi

  \ifdefined\longformat
  \makedemoslide{img/data-lineage-14.pdf}
                {We report the number of papers studied in a special year, desired year is stored in \texttt{.conf} file.}
  \else
  \makedemoslide{img/data-lineage-14.pdf}
                {All analysis steps cascade down to paper.pdf (URL and checksum of input in \texttt{INPUTS.conf}).}
  \fi

  \makedemoslide{img/data-lineage-15.pdf}
                {It is very easy to expand the project and add new analysis steps (this solution is scalable)}

























  \newcommand{\allopacity}{1}
  \begin{frame}{All questions have an answer now (in
        \alert{plain text}: human \& computer readable/archivable).}
    \include{tex/project-graph} \end{frame}
  \newcommand{\gitlogo}{}
  \begin{frame}{All questions have an answer now (in
        \alert{plain text}: so we can use Git to keep its history).}
    \include{tex/project-graph}
  \end{frame}




  %% Git-branch figure.  With `\longformat' defined, one flag is added
  %% before each new inclusion of `tex/git-branch', giving one frame per
  %% step; otherwise the flags are defined in one go and only the final
  %% frame (after the `\fi') is built.
  %% NOTE(review): presumably `tex/git-branch' checks these flags to
  %% decide what to draw -- confirm against that file.
  \ifdefined\longformat
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\tomorrow}{1}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\abstractify}{1}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\projinit}{}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\projwork}{}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\tempevolve}{}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\mergewithtemp}{}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\tofuture}{}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\githappy}{}
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}
  \newcommand{\gitverified}{}
  \else
  %% NOTE(review): unlike the branch above, `\tomorrow' is not defined
  %% here -- confirm that this difference is intended.
  \newcommand{\abstractify}{1}
  \newcommand{\projinit}{}
  \newcommand{\projwork}{}
  \newcommand{\tempevolve}{}
  \newcommand{\mergewithtemp}{}
  \newcommand{\tofuture}{}
  \newcommand{\githappy}{}
  \newcommand{\gitverified}{}
  \fi
  %% Final frame, built with all of the flags above defined.
  \begin{frame}{New projects branch from Maneage} \include{tex/git-branch} \end{frame}

  \ifdefined\longformat
  \begin{frame}{Two recent examples (publishing Git checksum in abstract)}
    \begin{columns}
      \column{0.5\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{img/firstpage-190911230.png}
      \column{0.5\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491.png}
    \end{columns}
  \end{frame}
  \fi

  \begin{frame}{Two recent examples (publishing Git checksum in abstract)}
    \begin{columns}
      \column{0.5\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{img/firstpage-190911230-highlighted.png}
      \column{0.5\linewidth}
      \centering
      \includegraphics[width=0.8\linewidth]{img/firstpage-mnras491-highlighted.png}
    \end{columns}
  \end{frame}





  \begin{frame}{Publication of the project}

    A reproducible project using Maneage will have the following
    (\alert{plain text}) components:
    \begin{itemize}
    \item Makefiles.
    \item \LaTeX{} source files.
    \item Configuration files for software used in analysis.
    \item Scripts/programming files (e.g., Python, Shell, AWK, C).
    \end{itemize}
    The \alert{volume} of the project's source will thus be
    \alert{negligible} compared to a single figure in a paper
    (usually $\sim100$ kilo-bytes).

    \vspace{1cm} The project's pipeline (customized Maneage) can be
    \alert{published} in
    \begin{itemize}
    \item \alert{arXiv}: uploaded with the \LaTeX{} source to always
      stay with the paper \\(for example
      \textcolor{blue}{\small\href{https://arxiv.org/abs/1505.01664}{arXiv:1505.01664}} or \textcolor{blue}{\small\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}).
    \item \alert{Zenodo}: Along with all the input datasets (many
      Gigabytes) and software \\(for example
      \textcolor{blue}{\small\href{https://doi.org/10.5281/zenodo.3872248}{zenodo.3872248}}) and given a unique DOI.
    \end{itemize}
  \end{frame}





  % Top-aligned frame showing the three shell commands needed to run a
  % Maneaged project.  Each command is set in \texttt with a gray trailing
  % comment; the runs of `{ }' are explicit single spaces (a bare run of
  % spaces would collapse into one) that pad the gap before the comment.
  \begin{frame}[t]{Executing a Maneaged project (for example \href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018})}
    \vspace{1cm}
    % Step 1: clone the project.
    % NOTE(review): the trailing `\\' directly before a paragraph break
    % usually produces an "Underfull \hbox" warning; left as is because it
    % also contributes to the hand-tuned vertical spacing.
    \texttt{\$ git clone https://gitlab.com/makhlaghi/maneage-paper{ }{ }{ }{ }\textcolor{gray}{\# Import the project.}}\\

    % In the long format, the next step is revealed on a separate overlay.
    \ifdefined\longformat\pause\fi
    \vspace{1.5cm}
    % Step 2: configure.  The second \texttt line holds only padding, so its
    % gray comment continues the one on the line above.
    \texttt{\$ ./project configure { }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# You will specify the build directory on your system,}}\\
    \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# and it will build all software (about 1.5 hours).}}

    % In the long format, the last step is revealed on a separate overlay.
    \ifdefined\longformat\pause\fi
    \vspace{1.5cm}
    % Step 3: run the analysis and build the paper.
    \texttt{\$ ./project make { }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }\textcolor{gray}{\# Does all the analysis and makes final PDF.}}\\
  \end{frame}





    \begin{frame}{Future prospects...}
    \large Adoption of reproducibility by many researchers will enable
    the following:

    \vspace{1em}
    \begin{itemize}
      \setlength\itemsep{3mm}
    \item A repository for education/training \textcolor{gray}{(PhD
      students, or researchers in other fields)}.
    \item Easy \alert{verification}/\alert{understanding} of other
      research projects \textcolor{gray}{(when necessary)}.
    \item Trivially \alert{test} different steps of others' work
      \textcolor{gray}{(different configurations, software, etc.)}.
    \item Science can progress \alert{incrementally}
      \textcolor{gray}{(shorter papers actually building on each
        other!)}.
    \item \alert{Extract meta-data} after the publication of a dataset
      \textcolor{gray}{(for future ontologies or vocabularies)}.
    \item Applying \alert{machine learning} on reproducible research
      projects will allow us to solve some Big Data Challenges:

      \vspace{1em}
      \begin{itemize}
        \setlength\itemsep{2mm}
      \item \emph{Extract the relevant parameters automatically}.
      \item \emph{Translate the science to enormous samples}.
      \item \emph{Believe the results when no one will have time to
        reproduce}.
      \item \emph{Have confidence in results derived using machine
        learning or AI}.
      \end{itemize}
    \end{itemize}
  \end{frame}





  \begin{frame}{Summary:}

    Maneage and its principles are described in \textcolor{blue}{\href{https://arxiv.org/abs/2006.03018}{arXiv:2006.03018}}.
    It is a customizable template that will do the following steps/instructions (all in simple plain text files).
    \begin{itemize}
      \item \alert{Automatically downloads} the necessary
        \emph{software} and \emph{data}.
      \item \alert{Builds} the software in a \alert{closed
        environment}.
      \item Runs the software on data to \alert{generate} the final
        \alert{research results}.
      \item Modification of part of the analysis will only
        result in re-doing that part, not the whole project.
      \item Using \LaTeX{} macros, the paper's figures, tables and numbers
        will be \alert{automatically updated} after a change in
        analysis, allowing the scientist to focus on the scientific
        interpretation.
      \item The whole project is under \alert{version control} (Git)
        to allow easy reversion to a previous state. This
        \alert{encourages tests/experimentation} in the analysis.
      \item The \alert{Git commit hash} of the project source is
        \alert{printed} in the published paper and \alert{saved on
          output} data products, ensuring the
        integrity/reproducibility of the result.
      \item \colorbox{green!30!white}{These slides are available at
        \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro-short.pdf}}.}
      \item \colorbox{green!15!white}{Longer slides are available at
        \textcolor{blue}{\url{https://maneage.org/pdf/slides-intro.pdf}}.}
    \end{itemize}

    \begin{tcolorbox}[width=\linewidth, boxsep=1pt, left=1pt, right=1pt,
                      top=1pt, bottom=1pt]
      For a technical description of Maneage's implementation, as well
      as a checklist to customize it, and tips on good practices,
      please see this page:

    \textcolor{blue}{\footnotesize\url{https://gitlab.com/maneage/project/-/blob/maneage/README-hacking.md}}
    \end{tcolorbox}
  \end{frame}
\end{document}