From 48e61df320e7571e527049cd3eef5cf96b8cb491 Mon Sep 17 00:00:00 2001 From: Mohammad Akhlaghi Date: Mon, 23 Mar 2020 03:16:06 +0000 Subject: Analysis and configuration file sections complete With this commit a description of these two important parts has been added to the project, along with several figures showing various parts of the files that are discussed. I also did some other restructuring of the figures and files to make things fit better into the description of the paper. --- paper.tex | 191 +++++++++++++++++++------ reproduce/analysis/config/INPUTS.conf | 15 ++ reproduce/analysis/config/INPUTS.mk | 15 -- reproduce/analysis/config/menke-demo-year.conf | 3 + reproduce/analysis/config/pdf-build.conf | 21 +++ reproduce/analysis/config/pdf-build.mk | 21 --- reproduce/analysis/config/verify-outputs.conf | 3 + reproduce/analysis/config/verify-outputs.mk | 3 - reproduce/analysis/make/analysis-1.mk | 89 ------------ reproduce/analysis/make/demo-plot.mk | 59 ++++++++ reproduce/analysis/make/format.mk | 82 +++++++++++ reproduce/analysis/make/top-make.mk | 5 +- tex/src/figure-data-lineage.tex | 46 +++--- tex/src/figure-download.tex | 8 -- tex/src/figure-file-architecture.tex | 2 +- tex/src/figure-inputconf.tex | 8 ++ tex/src/figure-mk20tab3.tex | 59 -------- tex/src/figure-src-demoplot.tex | 32 +++++ tex/src/figure-src-download.tex | 8 ++ tex/src/figure-src-format.tex | 59 ++++++++ tex/src/figure-src-topmake.tex | 24 ++++ tex/src/figure-tools-per-year.tex | 34 +++++ 22 files changed, 523 insertions(+), 264 deletions(-) create mode 100644 reproduce/analysis/config/INPUTS.conf delete mode 100644 reproduce/analysis/config/INPUTS.mk create mode 100644 reproduce/analysis/config/menke-demo-year.conf create mode 100644 reproduce/analysis/config/pdf-build.conf delete mode 100644 reproduce/analysis/config/pdf-build.mk create mode 100644 reproduce/analysis/config/verify-outputs.conf delete mode 100644 reproduce/analysis/config/verify-outputs.mk delete mode 100644 
reproduce/analysis/make/analysis-1.mk create mode 100644 reproduce/analysis/make/demo-plot.mk create mode 100644 reproduce/analysis/make/format.mk delete mode 100644 tex/src/figure-download.tex create mode 100644 tex/src/figure-inputconf.tex delete mode 100644 tex/src/figure-mk20tab3.tex create mode 100644 tex/src/figure-src-demoplot.tex create mode 100644 tex/src/figure-src-download.tex create mode 100644 tex/src/figure-src-format.tex create mode 100644 tex/src/figure-src-topmake.tex create mode 100644 tex/src/figure-tools-per-year.tex diff --git a/paper.tex b/paper.tex index f6aeed3..6a1e73a 100644 --- a/paper.tex +++ b/paper.tex @@ -624,12 +624,12 @@ The latter is necessary for many web-based automatic paper generating systems li \vspace{-5mm} \caption{\label{fig:files} Directory and file structure in a hypothetical project using this solution. - Files are shown with small, green boxes that have a suffix in their names (for example \inlinecode{analysis-1.mk} or \inlinecode{param-2.conf}). + Files are shown with small, green boxes that have a suffix in their names (for example \inlinecode{format.mk} or \inlinecode{download.tex}). Directories (containing multiple files) are shown as large, brown boxes, where the name ends in a slash (\inlinecode{/}). Directories with dashed lines and no files (just a description) are symbolic links that are created after building the project, pointing to commonly needed built directories. Symbolic links and their contents are not considered part of the source and are not under version control. Files and directories are shown within their parent directory. - For example the full address of \inlinecode{analysis-1.mk} from the top project directory is \inlinecode{reproduce/analysis/make/analysis-1.mk}. + For example the full address of \inlinecode{format.mk} from the top project directory is \inlinecode{reproduce/analysis/make/format.mk}. 
} \end{figure} @@ -908,6 +908,8 @@ We'll follow Make's paradigm (see Section \ref{sec:usingmake}) of starting form Blue files/boxes are output files of various steps in the build-directory, located within the Makefile (\inlinecode{*.mk}) that generates them. For example \inlinecode{paper.pdf} depends on \inlinecode{project.tex} (in the build directory and generated automatically) and \inlinecode{paper.tex} (in the source directory and written by hand). In turn, \inlinecode{project.tex} depends on all the \inlinecode{*.tex} files at the bottom of the Makefiles above it. + The solid arrows and built boxes with full opacity are actually described in the context of a demonstration project in this paper. + The dashed arrows and lower-opacity built boxes just show how adding more elements to the lineage is also easily possible, making this a scalable tool. } \end{figure} @@ -961,14 +963,14 @@ In other words, it formalizes the connections of this scholarship with previous \label{sec:valuesintext} Figures, plots, tables and narrative aren't the only analysis output that goes into the paper. In many cases, quantitative values from the analysis are also blended into the sentences of the report's narration. -For example this sentence in the abstract of \citet{akhlaghi19}: ``... the outer wings of M51 down to S/N of 0.25 ...''. +For example this sentence in the abstract of \citet{akhlaghi19}: ``... detect the outer wings of M51 down to S/N of 0.25 ...''. The reported signal-to-noise ratio (S/N) value ``0.25'' depends on the analysis and is an output of the analysis just like paper's figures and plots. Manually typing the number in the \LaTeX{} source is prone to very important bugs: the author may forget to check it after a change in an analysis (e.g., using a newer version of the software, or changing an analysis parameter for another part of the paper). 
Given the evolution of a scientific projects, this type of human error is very hard to avoid when such values are manually written. Such values must also be automatically generated. To automatically generate and blend them in the text, we use \LaTeX{} macros. -In the quote above, the \LaTeX{} source\footnote{\citet{akhlaghi19} uses this templat to be reproducible, so its LaTeX source is available in multiple ways: 1) direct download from arXiv:\href{https://arxiv.org/abs/1909.11230}{1909.11230}, by clicking on ``other formats'', or 2) the Git or \href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481} links is also available on arXiv.} looks like this: ``\inlinecode{\small the outer wings of M51 down to S/N of \$\textbackslash{}demo\-sf\-optimized\-sn\$}''. +In the quote above, the \LaTeX{} source\footnote{\citet{akhlaghi19} uses this template to be reproducible, so its LaTeX source is available in multiple ways: 1) direct download from arXiv:\href{https://arxiv.org/abs/1909.11230}{1909.11230}, by clicking on ``other formats'', or 2) the Git or \href{https://doi.org/10.5281/zenodo.3408481}{zenodo.3408481} links are also available on arXiv.} looks like this: ``\inlinecode{\small detect the outer wings of M51 down to S/N of \$\textbackslash{}demo\-sf\-optimized\-sn\$}''. The \LaTeX{} macro ``\inlinecode{\small\textbackslash{}demosfoptimizedsn}'' is automatically calculated and recorded during in the project and expands to the value ``\inlinecode{0.25}''. The automatically generated file \inlinecode{project.tex} stores all such inline output macros. Furthermore, Figure \ref{fig:datalineage} shows that it is a prerequisite of \inlinecode{paper.pdf} (as well as the manually written \LaTeX{} sources that are shown in green). 
@@ -977,14 +979,14 @@ Therefore \inlinecode{paper.pdf} will not be built until this file is ready and However, managing all the necessary \LaTeX{} macros for a full project in one file is against the modularity principle and can be frustrating and buggy. To address this problem, all subMakefiles \emph{must} contain a fixed target with the same base-name, but with a \inlinecode{.tex} suffix. For example in Figure \ref{fig:datalineage}, assume \inlinecode{out-1b.dat} is a table and the mean of its third column must be reported in the paper. -Therefore in \inlinecode{analysis1.mk}, a prerequisite of \inlinecode{analysis1.tex} is \inlinecode{out-1b.dat} (as shown by the arrow in Figure \ref{fig:datalineage}). -The recipe of this rule will calculate the mean of the column and put it in the \LaTeX{} macro which is written in \inlinecode{analysis1.tex}. -In a similar way, any other reported calculation from \inlinecode{analysis1.mk} is stored as a \LaTeX{} macro in \inlinecode{analysis1.tex}. +Therefore in \inlinecode{format.mk}, a prerequisite of \inlinecode{format.tex} is \inlinecode{out-1b.dat} (as shown by the arrow in Figure \ref{fig:datalineage}). +The recipe of this rule will calculate the mean of the column and put it in the \LaTeX{} macro which is written in \inlinecode{format.tex}. +In a similar way, any other reported calculation from \inlinecode{format.mk} is stored as a \LaTeX{} macro in \inlinecode{format.tex}. These \LaTeX{} macro files thus form the core skeleton of the project: as shown in Figure \ref{fig:datalineage}, the outward arrows of all built files of any subMakefile ultimately leads to one of these \LaTeX{} macro files. Note that \emph{built} files in a subMakefile don't have to be a prerequisite of its \inlinecode{.tex} file. They may point to another Makefile's \LaTeX{} macro file. 
-For example even though \inlinecode{input1.dat} is a target in \inlinecode{download.mk}, it isn't a prerequisite of \inlinecode{download.tex}, it is a prerequisite of \inlinecode{out-2a.dat} (a target in \inlinecode{analysis2.mk}). +For example even though \inlinecode{input1.dat} is a target in \inlinecode{download.mk}, it isn't a prerequisite of \inlinecode{download.tex}, it is a prerequisite of \inlinecode{out-2a.dat} (a target in \inlinecode{demo-plot.mk}). The lineage ultimate ends in a \LaTeX{} macro file in \inlinecode{analysis3.tex}. @@ -1054,33 +1056,39 @@ Note that in such cases the servers often encode the creation date and version o Even when the actual data is identical, this metadata (which is in the same file) will differ based on the moment the query was done. Therefore a simple checksum of the whole downloaded file can't be used for validation in such scenarios, see Section \ref{principle:verify}. +\begin{figure}[t] + \input{tex/src/figure-inputconf.tex} + \vspace{-3mm} + \caption{\label{fig:inputconf} Contents of the \inlinecode{INPUTS.conf} file for the demonstration dataset of \citet{menke20}. + This file contains the basic, or minimal, metadata for retrieving the required dataset(s) of a project: it can become arbitrarily long. + Here, \inlinecode{MK20DATA} contains the name of this dataset within this project. + \inlinecode{MK20MD5} contains the MD5 checksum of the dataset, in order to check the validity and integrity of the dataset before usage. + \inlinecode{MK20SIZE} contains the size of the dataset in human readable format. + \inlinecode{MK20URL} is the URL which the dataset is automatically downloaded from (only when it's not already present on the host). + Note that the original URL (footnote \ref{footnote:dataurl}) was too long to display properly here. 
+ } +\end{figure} + Each external dataset has some basic information, including its expected name on the local system (for offline access), the necessary checksum to validate it (either the whole file or just its main ``data''), and its URL/PID. In this template, such information regarding a project's input dataset(s) is in the \inlinecode{INPUTS.conf} file. See Figures \ref{fig:files} \& \ref{fig:datalineage} for the position of \inlinecode{INPUTS.conf} in the project's file structure and data lineage respectively. -For demonstration, in this paper, we are using the datasets of \citet{menke20} which are stored in one \inlinecode{.xlsx} file on bioXriv. -In \inlinecode{INPUTS.conf}, the example lines below show the necessary information as Make variables for this dataset. -Just note the the full URL was too large to show in this demonstration\footnote{\label{footnote:dataurl}This is the full URL: \url{\menketwentyurl}. Note that in the \LaTeX{} source, this URL is just a macro that was created in \inlinecode{download.mk}, and directly comes from \inlinecode{INPUTS.mk}, it is not hand-written.}. - -\begin{lstlisting}[language=bash] - MK20DATA = menke20.xlsx - MK20MD5 = 8e4eee64791f351fec58680126d558a0 - MK20SIZE = 1.9MB - MK20URL = https://the.full.url/is/too/large/for/here/media-1.xlsx -\end{lstlisting} +For demonstration, in this paper, we are using the datasets of \citet{menke20} which are stored in one \inlinecode{.xlsx} file on bioRxiv\footnote{\label{footnote:dataurl}Full data URL: \url{\menketwentyurl}}. +Figure \ref{fig:inputconf} shows the corresponding \inlinecode{INPUTS.conf}, with the necessary information as Make variables for this dataset. \begin{figure}[t] - \input{tex/src/figure-download.tex} + \input{tex/src/figure-src-download.tex} \vspace{-3mm} - \caption{\label{fig:download} Simplified Make rule, showing how the downloaded data URL is written into this paper (Footnote \ref{footnote:dataurl}). 
- In Make, lines starting with a \inlinecode{\#} are ignored (thus used for comments, like first line here). - The \emph{target} is placed before a colon (\inlinecode{:}) and its \emph{prerequisite(s)} is(are) after the colon (here, both can be seen in the second line). + \caption{\label{fig:download} Simplified Make rule, showing how the downloaded data URL is written into this paper (in Footnote \ref{footnote:dataurl}). + In Make, lines starting with a \inlinecode{\#} are ignored (thus used for human-readable comments, like the red line shown here). + The \emph{target} is placed before a colon (\inlinecode{:}) and its \emph{prerequisite(s)} is(are) after the colon. + Here, both the target and prerequisite can be seen in the second line. The executable \emph{recipe} lines (shell commands to build the target from the prerequisite), start with a \inlinecode{TAB} (shown here with a light gray \inlinecode{\_\_\_TAB\_\_\_}). - A Make recipe can be viewed as a containerized shell script. + A Make recipe is an independent, or containerized, shell script. In the recipe, \inlinecode{\$@} is an \emph{automatic variable}, expanding to the target file's name. - The \inlinecode{MK20URL} variable is defined in \inlinecode{INPUTS.conf} and directly used to download the input dataset. - The same URL is then passed to this paper through the definition of the \inlinecode{\textbackslash{}menketwentyurl} \LaTeX{} variable that is written in \inlinecode{\$(mtexdir)/download.tex}. + For \inlinecode{MK20URL}, see Figure \ref{fig:inputconf}. + The same URL is then passed to this paper through the definition of the \LaTeX{} variable \inlinecode{\textbackslash{}menketwentyurl} that is written in \inlinecode{\$(mtexdir)/download.tex}. Later, when the paper's PDF is being built, this \inlinecode{.tex} file is loaded into it. - \inlinecode{mtexdir} is the directory hosting all the \LaTeX{} macro files for various stages of the analysis, see Section \ref{sec:valuesintext}. 
+ \inlinecode{\$(mtexdir)} is the directory hosting all the \LaTeX{} macro files for various stages of the analysis, see Section \ref{sec:valuesintext}. } \end{figure} @@ -1114,52 +1122,149 @@ The analysis subMakefile(s) are loaded into \inlinecode{top-make.mk} after the i However, the analysis phase involves much more complexity. If done without modularity in mind from the start, research project sources can become very long, thus becoming hard to modify, debug, improve or read. Maneage is therefore designed to encourage and facilitate splitting the analysis into multiple/modular subMakefiles. -For example in the data lineage graph of Figure \ref{fig:datalineage}, the analysis is broken into three subMakefiles: \inlinecode{analysis-1.mk}, \inlinecode{analysis-2.mk} and \inlinecode{analysis-3.mk}. +For example in the data lineage graph of Figure \ref{fig:datalineage}, the analysis is broken into three subMakefiles: \inlinecode{format.mk}, \inlinecode{demo-plot.mk} and \inlinecode{analysis3.mk}. -Theoretical discussion of this phase can be hard to follow, we will thus describe the contents of \inlinecode{analysis1\-.mk} (Figure \ref{fig:mk20tab3}) in a demo project on data from \citet{menke20}. +Theoretical discussion of this phase can be hard to follow, we will thus describe a demonstration project on data from \citet{menke20}. In Section \ref{sec:download}, the process of importing this dataset into the proejct was described. -The first issue is that \inlinecode{menke20.xlsx} must be converted to a simple plain-text table which is generically usable by simple tools (see principle of minimal complexity in Section \ref{principle:complexity}). +The first issue is that \inlinecode{menke20.xlsx} must be converted to a simple plain-text table which is generically usable by simple tools (see the principle of minimal complexity in Section \ref{principle:complexity}). For more on the problems with Microsoft Office and this format, see Section \ref{sec:lowlevelanalysis}. 
-In \inlinecode{analysis1.mk} (Figure \ref{fig:mk20tab3}), we thus convert it to a simple white-space separated, plain-text table (\inlinecode{menke20-table-3.txt}) and do a basic calculation here to report. +In \inlinecode{format.mk} (Figure \ref{fig:formatsrc}), we thus convert it to a simple white-space separated, plain-text table (\inlinecode{menke20-table-3.txt}) and do a basic calculation on it. \begin{figure}[t] - \input{tex/src/figure-mk20tab3.tex} + \input{tex/src/figure-src-format.tex} \vspace{-3mm} - \caption{\label{fig:mk20tab3}Simplified contents of \inlinecode{analysis1.mk}. + \caption{\label{fig:formatsrc}Simplified contents of \inlinecode{format.mk}. Here, we want to convert the downloaded XLSX dataset (Office Open XML Workbook format) to a simple plain-text fixed-width-per-column table. For the position of this subMakefile in the full project's data lineage, see Figure \ref{fig:datalineage}. - In particular, here the arrows of that figure from \inlinecode{menke20.xlsx} to \inlinecode{menke20-table-3.txt} and from the latter to \inlinecode{analysis1.tex} are shown as the second and third Make rules. + In particular, here the arrows of that figure from \inlinecode{menke20.xlsx} to \inlinecode{menke20-table-3.txt} and from the latter to \inlinecode{format.tex} are shown as the second and third Make rules. See Figure \ref{fig:download} and Appendix \ref{appendix:make} for more on the Make notation and Section \ref{sec:analysis} for describing the steps. } \end{figure} -As shown in Figure \ref{fig:mk20tab3}, the first operation (or Make \emph{rule}) is to define a directory to keep the generated files. +As shown in Figure \ref{fig:formatsrc}, the first operation (or Make \emph{rule}) is to define a directory to keep the generated files. To keep the source and build-directories separate, we thus define \inlinecode{a1dir} under the build-directory (\inlinecode{BDIR}, see Section \ref{sec:localdirs}). 
We'll then define all outputs/targets to be under this directory. The second rule (which depends on the directory as a prerequisite), then converts the Microsoft Excel spreadsheet file to a simple plain-text format using the XLSX I/O program. But XLSX I/O only converts to CSV and we don't need all the columns here, so we further shorten and modify the table (re-order columns and multiply them) using the AWK program (which is available on any Unix-like operating system). In Figure \ref{fig:datalineage} on the example data lineage, this second rule is shown with the arrow from \inlinecode{menke20.xlsx} to \inlinecode{menke20-table-3.txt}. -Finally, as described in Section \ref{sec:valuesintext}, the last rule of a subMakefile should be a \LaTeX{} macro file (in Figure \ref{fig:mk20tab3}, this is the third rule). -Ending each analysis phase with a \LaTeX{} macro is natural in many reports. +Finally, as described in Section \ref{sec:valuesintext}, the last rule of a subMakefile should be a \LaTeX{} macro file (in Figure \ref{fig:formatsrc}, this is the third rule). +Ending each analysis phase with a \LaTeX{} macro is natural in many papers/reports. For example, here, once the dataset is ready, we want to give the reader a general view of the dataset size. -We thus need to report the number of subjects (papers/journals) studied in \citet{menke20}. +We thus need to report the number of subjects studied in \citet{menke20}. Therefore in the \LaTeX{} macro rule, we count them from the simplified table of the second rule. -In both cases, we write the sum as a temporary shell variable \inlinecode{v}, then write the value of \inlinecode{v} into \inlinecode{\textbackslash{}menkenumpapers} and \inlinecode{\textbackslash{}menkenumjournals} \LaTeX{} macros respectively. 
+In both cases, we write the sum as a temporary shell variable \inlinecode{v}, which is respectively written into these two \LaTeX{} macros \inlinecode{\textbackslash{}menkenumpapers} and \inlinecode{\textbackslash{}menkenumjournals}. In the built PDF paper, they expand to $\menkenumpapers$ (number of papers studied) and $\menkenumjournals$ (number of journals studied) respectively. -This rule is shown schematically in Figure \ref{fig:datalineage} with the arrow from \inlinecode{menke20-table-3.txt} to \inlinecode{analysis1.tex}. +This rule is shown schematically in Figure \ref{fig:datalineage} with the arrow from \inlinecode{menke20-table-3.txt} to \inlinecode{format.tex}. -Figure \ref{fig:mk20tab3} also shows one major advantage of Maneage: 1) The XLSX I/O software may not be present on many systems, or 2) the \inlinecode{FPAT} feature is only present in GNU AWK, not all implementations of AWK. -Therefore, while this Makefile can work when run alone, on many systems it won't complete successfuly because of these major portability problems. -However, because Maneage installs its own software, these problems don't exist: specific versions of XLSX I/O and GNU AWK are installed within the project. -Such portability problems are much more pronounced and relevant in higher-level science software. +To further demonstrate the concept, we'll reproduce (with some enhancements) Figure 1C of \citet{menke20} in Figure \ref{fig:toolsperyear}. +Figure \ref{fig:toolsperyear} also shows the number of papers that were studied each year in the same plot (unlike the original plot). +Its horizontal axis also shows the full range of the data (starting from $\menkefirstyear$) while the original Figure 1C in \citet{menke20} starts from 1997. +The reason \citet{menke20} decided to avoid earlier years was probably the small number of papers before 1997. +For example in \menkenumpapersdemoyear, they had only studied \menkenumpapersdemocount{} papers. 
+Note that both the numbers of this sentence, and the first year of data mentioned above, are actually \LaTeX{} macros (see Figure \ref{fig:demoplotsrc}). +\begin{figure}[t] + \begin{center} + \includetikz{figure-tools-per-year} + \end{center} + \vspace{-5mm} + \caption{\label{fig:toolsperyear}Fraction of papers mentioning software tools (green line, left vertical axis) to total number of papers studied in that year (light red bars, right vertical axis in log-scale). + Data from \citet{menke20}. + The subMakefile archiving the executable lineage of figure's data is shown in Figure \ref{fig:demoplotsrc} and discussed in Section \ref{sec:analysis}. + } +\end{figure} +The operation of reproducing that figure is a contextually separate operation from the operations that were described above in \inlinecode{format.mk}. +Therefore we add a new subMakefile to the project called \inlinecode{demo-plot.mk}, which is shown in Figure \ref{fig:demoplotsrc}. +As before, in the first rule, we make the directory to host the data (\inlinecode{a2dir}). +However, unlike before, this directory is placed under \inlinecode{texdir} which is the directory hosting all \LaTeX{} related files. +This is because the plot of Figure \ref{fig:toolsperyear} is directly made within \LaTeX{}, using its PGFPlots package\footnote{PGFPlots package of \LaTeX: \url{https://ctan.org/pkg/pgfplots}. + \inlinecode{texdir} has some special features when using \LaTeX{}, see Section \ref{sec:buildingpaper}. + PGFPlots uses the same graphics engine that is building the paper, producing a high-quality figure that blends nicely in the paper.}. +Note that this is just our personal choice, other methods of generating plots (for example with R, Gnuplot or Matplotlib) are also possible within this system, see Section \ref{sec:buildingpaper}. 
+ +The plain-text table that is used to build Figure \ref{fig:toolsperyear} is defined as the variable \inlinecode{a2mk20f1c} of Figure \ref{fig:demoplotsrc} (just above the second rule). +As shown in the second rule, again we use GNU AWK to extract the necessary information from \inlinecode{mk20tab3} (which was built in \inlinecode{format.mk}). +\inlinecode{mk20tab3} is thus the \emph{prerequisite} of \inlinecode{a2mk20f1c} (along with \inlinecode{a2dir}). +In Figure \ref{fig:datalineage}, this lineage is shown as the arrow from \inlinecode{menke20-table-3.txt} (file name of \inlinecode{mk20tab3}) that points to \inlinecode{tools-per-year.txt} (file name of \inlinecode{a2mk20f1c}). + +As with all subMakefiles, \inlinecode{demo-plot.mk} finishes with the rule to build its \LaTeX{} macro file (\inlinecode{demo-plot.tex}) containing the values reported above. +But here, it doesn't just depend on \inlinecode{a2mk20f1c}, it also depends on the \inlinecode{menke-demo-year.conf} configuration file. +This is also visible in the data lineage (Figure \ref{fig:datalineage}): two arrows point to \inlinecode{demo-plot.tex}, one from a configuration file, and one from a built file. +Configuration files are discussed in more detail in Section \ref{sec:configfiles}. +\begin{figure}[t] + \input{tex/src/figure-src-demoplot.tex} + \vspace{-2mm} + \caption{\label{fig:demoplotsrc}Contents of the \inlinecode{demo-plot.mk} subMakefile used to generate the data for Figure \ref{fig:toolsperyear}. + } +\end{figure} +In a similar manner many more subMakefiles can be added in more complex analysis scenarios. +This is shown with the lower opacity files and dashed arrows of the data lineage in Figure \ref{fig:datalineage}. +Generally, the files created within one subMakefile don't necessarily have to be a prerequisite of its \LaTeX{} macro. 
+For example see \inlinecode{demo-out.dat} in Figure \ref{fig:datalineage}: it is managed in \inlinecode{demo-plot.mk}, however, it isn't a prerequisite of \inlinecode{demo-plot.tex}, it is a prerequisite of \inlinecode{out-3b.dat} (which is managed in \inlinecode{another-step.mk} and is a prerequisite of \inlinecode{another-step.tex}). +Hence ultimately, through another file, its descendants conclude in a \LaTeX{} macro. +The high-level \inlinecode{top-make.mk} file is designed to simplify the addition of new subMakefiles for the authors, and reading the source for readers (see Section \ref{sec:highlevelanalysis}). +As mentioned before, this high-level Makefile just defines the ultimate target (\inlinecode{paper.pdf}, see Section \ref{sec:paperpdf}) and imports all the subMakefiles in the specific order. +For example Figure \ref{fig:topmake} shows this project's \inlinecode{top-make.mk}. +When descriptive names are chosen for the subMakefiles, a simple glance over the values to \inlinecode{makesrc} here provides a general understanding of the project without needing to get into the technical details. +\begin{figure}[t] + \input{tex/src/figure-src-topmake.tex} + \vspace{-3mm} + \caption{\label{fig:topmake} Important parts of High-level \inlinecode{top-make.mk}. + } +\end{figure} + + + +\subsubsection{Configuration files} +\label{sec:configfiles} + +The analysis subMakefiles discussed above in Section \ref{sec:analysis} should only contain the organization of an analysis, they should not contain any fixed numbers, settings or parameters. +Such elements should only be used as variables that are defined elsewhere. +In the data lineage plot of Figure \ref{fig:datalineage}, configuration files are shown as the sharp-edged, green \inlinecode{*.conf} files in the top row. 
+ +The last recipe of Figure \ref{fig:demoplotsrc} is a good demonstration of their usage: in Section \ref{sec:analysis}, we reported the number of papers studied by \citet{menke20} in \menkenumpapersdemoyear. +However, note that in Figure \ref{fig:demoplotsrc}, the year's number is not written by hand in the subMakefile. +It is referenced through the \inlinecode{menke-demo-year} variable, which is defined in \inlinecode{menke-demo-year.conf}, that is a prerequisite of the \inlinecode{demo-plot.tex} rule. +This is also visible in the data lineage of Figure \ref{fig:datalineage}. + +All the configuration files of a project are placed under the \inlinecode{reproduce/analysis/config} (see Figure \ref{fig:files}) subdirectory, and are loaded into \inlinecode{top-make.mk} before any of the subMakefiles, see Figure \ref{fig:topmake}. +The configuration files greatly simplify project management from multiple perspectives as listed below: + +\begin{itemize} +\item If an analysis parameter is used in multiple places within the project, simply changing the value in the configuration file will change it everywhere in the project. + This is critical in more complex projects and if not done like this can lead to significant human error. +\item Configuration files enable the logical separation between the low-level implementation and high-level running of a project. + For example after writing the project, the authors don't need to remember where the number/parameter was used, they can just modify the configuration file. + Other co-authors, or readers, of the project also benefit: they just need to know that there is a unified place for high-level project settings, parameters, or numbers without necessarily having to know the low-level implementation. +\item A configuration file will be a prerequisite to any rule that uses its value. 
+ If the configuration file is updated (the value/parameter is changed), Make will automatically detect the data lineage branch that is affected by it and re-execute only that branch, without any human interference. +\end{itemize} +This is a great leap compared to the current, mostly manual, project management that many scientists employ. +Manual management is prone to serious human error factors: at the later phases of a project, scientists are least likely to experiment on their project's configurations. +However, the later phases of a project are precisely the times where the lower-level parts of the project are complete and the authors can look at the bigger picture. +This style of managing project parameters therefore produces a much more healthy scientific result where experimentation is very cheap during all phases of a project; before its publication (by the authors) and after it (by the authors and readers). + + + + + +\subsubsection{The validation} +\label{sec:thevalidation} + +\subsubsection{Building the paper} +\label{sec:buildingpaper} + +\begin{itemize} +\item Discuss the importance of putting the \LaTeX{} related files in \inlinecode{texdir}. Especially how \inlinecode{tex/build} points to it. +\item Discuss how easy it is to build graphics outside of \LaTeX{}. +\end{itemize} \section{Discussion} diff --git a/reproduce/analysis/config/INPUTS.conf b/reproduce/analysis/config/INPUTS.conf new file mode 100644 index 0000000..b1cf546 --- /dev/null +++ b/reproduce/analysis/config/INPUTS.conf @@ -0,0 +1,15 @@ +# Input files necessary for this project. +# +# This file is read by the configure script and running Makefiles. +# +# Copyright (C) 2018-2020 Mohammad Akhlaghi +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice and +# this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +MK20DATA = menke20.xlsx +MK20MD5 = 8e4eee64791f351fec58680126d558a0 +MK20SIZE = 1.9MB +MK20URL = https://www.biorxiv.org/content/biorxiv/early/2020/01/18/2020.01.15.908111/DC1/embed/media-1.xlsx diff --git a/reproduce/analysis/config/INPUTS.mk b/reproduce/analysis/config/INPUTS.mk deleted file mode 100644 index b1cf546..0000000 --- a/reproduce/analysis/config/INPUTS.mk +++ /dev/null @@ -1,15 +0,0 @@ -# Input files necessary for this project. -# -# This file is read by the configure script and running Makefiles. -# -# Copyright (C) 2018-2020 Mohammad Akhlaghi -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice and -# this notice are preserved. This file is offered as-is, without any -# warranty. - -MK20DATA = menke20.xlsx -MK20MD5 = 8e4eee64791f351fec58680126d558a0 -MK20SIZE = 1.9MB -MK20URL = https://www.biorxiv.org/content/biorxiv/early/2020/01/18/2020.01.15.908111/DC1/embed/media-1.xlsx diff --git a/reproduce/analysis/config/menke-demo-year.conf b/reproduce/analysis/config/menke-demo-year.conf new file mode 100644 index 0000000..429b220 --- /dev/null +++ b/reproduce/analysis/config/menke-demo-year.conf @@ -0,0 +1,3 @@ +# This is the demonstration year showing the number of papers studied +# before 1997. +menke-demo-year = 1996 diff --git a/reproduce/analysis/config/pdf-build.conf b/reproduce/analysis/config/pdf-build.conf new file mode 100644 index 0000000..e2d59cc --- /dev/null +++ b/reproduce/analysis/config/pdf-build.conf @@ -0,0 +1,21 @@ +# Make the final PDF? +# ------------------- +# +# During the project's early phases, it is usually not necessary to build +# the PDF file (which makes a lot of output lines on the command-line and +# can make it hard to find the commands and possible errors (and their +# outputs). 
Also, in some cases, only the produced results may be of +# interest and not the final PDF, so LaTeX (and its necessary packages) may +# not be installed. +# +# If this variable is given any string, a PDF will be made with +# LaTeX. Otherwise, a notice will just printed that for now, no PDF will be +# created. +# +# Copyright (C) 2018-2020 Mohammad Akhlaghi +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice and +# this notice are preserved. This file is offered as-is, without any +# warranty. +pdf-build-final = yes diff --git a/reproduce/analysis/config/pdf-build.mk b/reproduce/analysis/config/pdf-build.mk deleted file mode 100644 index e2d59cc..0000000 --- a/reproduce/analysis/config/pdf-build.mk +++ /dev/null @@ -1,21 +0,0 @@ -# Make the final PDF? -# ------------------- -# -# During the project's early phases, it is usually not necessary to build -# the PDF file (which makes a lot of output lines on the command-line and -# can make it hard to find the commands and possible errors (and their -# outputs). Also, in some cases, only the produced results may be of -# interest and not the final PDF, so LaTeX (and its necessary packages) may -# not be installed. -# -# If this variable is given any string, a PDF will be made with -# LaTeX. Otherwise, a notice will just printed that for now, no PDF will be -# created. -# -# Copyright (C) 2018-2020 Mohammad Akhlaghi -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice and -# this notice are preserved. This file is offered as-is, without any -# warranty. 
-pdf-build-final = yes diff --git a/reproduce/analysis/config/verify-outputs.conf b/reproduce/analysis/config/verify-outputs.conf new file mode 100644 index 0000000..e580e04 --- /dev/null +++ b/reproduce/analysis/config/verify-outputs.conf @@ -0,0 +1,3 @@ +# To disable verification of output datasets set this variable to yes + +verify-outputs = diff --git a/reproduce/analysis/config/verify-outputs.mk b/reproduce/analysis/config/verify-outputs.mk deleted file mode 100644 index e580e04..0000000 --- a/reproduce/analysis/config/verify-outputs.mk +++ /dev/null @@ -1,3 +0,0 @@ -# To disable verification of output datasets set this variable to yes - -verify-outputs = diff --git a/reproduce/analysis/make/analysis-1.mk b/reproduce/analysis/make/analysis-1.mk deleted file mode 100644 index f739306..0000000 --- a/reproduce/analysis/make/analysis-1.mk +++ /dev/null @@ -1,89 +0,0 @@ -# Use the data from Menke 2020 (DOI:10.1101/2020.01.15.908111) as a -# demonstration analysis for this paper. This is a relevant paper because -# it provides good statistics about the status of reproducibility in -# scientific publications. -# -# Copyright (C) 2020 Mohammad Akhlaghi -# -# This Makefile is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This Makefile is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. See . - - - - -# Save the "Table 3" spreadsheet from the downloaded `.xlsx' file into a -# simple plain-text file that is easy to use. -a1dir = $(BDIR)/analysis-1 -mk20tab3 = $(a1dir)/menke20-table-3.txt -$(a1dir):; mkdir $@ -$(mk20tab3): $(indir)/menke20.xlsx | $(a1dir) - - # Set a base-name for the table-3 data. 
- base=$(basename $(notdir $<))-table-3 - - # Unfortunately XLSX I/O only works when the input and output are - # in the directory it is running. So first, we need to switch to - # the input directory, run it, then put our desired output where we - # want and delete the extra files. - topdir=$$(pwd) - cd $(indir) - xlsxio_xlsx2csv $(notdir $<) - cp $(notdir $<)."Table 3 All by journal by year".csv $$base.csv - rm $(notdir $<).*.csv - cd $$topdir - - # Read the necessary information. Note that we are dealing with a - # CSV (comma-separated value) file. But when there are commas in a - # string, quotation signs are put around it. The `FPAT' values is - # fully described in the GNU AWK manual. In short, it ensures that - # if there is a comma in the middle of double-quotes, it doesn't - # count as a delimter. - echo "# Column 1: YEAR [counter, i16] Year of journal's publication." > $@.tmp - echo "# Column 2: NUM_PAPERS [counter, i16] Number of studied papers in that journal." >> $@.tmp - echo "# Column 3: NUM_PAPERS_WITH_TOOLS [counter, i16] Number of papers with an identified tool." >> $@.tmp - echo "# Column 4: NUM_ID_TOOLS [counter, i16] Number of software/tools that were identified." >> $@.tmp - echo "# Column 5: JOURNAL_NAME [string, str150] Name of journal." >> $@.tmp - awk 'NR>1{printf("%-10d%-10d%-10d%-10d %s\n", $$2, $$3, $$3*$$NF, $$(NF-1), $$1)}' \ - FPAT='([^,]+)|("[^"]+")' $(indir)/$$base.csv >> $@.tmp - - # Set the temporary file as the final target. This was done so if - # there is any possible crash in the steps above, this rule is - # re-run (its final target isn't rebuilt). - mv $@.tmp $@ - - - - -############################ -# Recreate Figure 1C of Menke+20. 
-############################ - -# awk '!/^#/{all[$1]+=$2; id[$1]+=$3} END{for(year in all){print year, id[year]/all[year]}}' menke20-table-3.txt.tmp -############################ - - - - - -# Main LaTeX macro file -$(mtexdir)/analysis-1.tex: $(mk20tab3) | $(mtexdir) - - # Count the total number of papers in their study. - v=$$(awk '!/^#/{c+=$$2} END{print c}' $(mk20tab3)) - echo "\newcommand{\menkenumpapers}{$$v}" > $@ - - # Count how many unique journals there were in the study. Note that - # the `31' comes because we put 10 characters for each numeric - # column and separated the last numeric column from the string - # column with a space. If the number of numeric columns change in - # the future, the `31' also has to change. - v=$$(awk 'BEGIN{FIELDWIDTHS="41 10000"} !/^#/{print $$2}' \ - $(mk20tab3) | uniq | wc -l) - echo "\newcommand{\menkenumjournals}{$$v}" >> $@ diff --git a/reproduce/analysis/make/demo-plot.mk b/reproduce/analysis/make/demo-plot.mk new file mode 100644 index 0000000..caf77af --- /dev/null +++ b/reproduce/analysis/make/demo-plot.mk @@ -0,0 +1,59 @@ +# Second step of analysis: +# Data for plot of number/fraction of tools per year. +# +# Copyright (C) 2020 Mohammad Akhlaghi +# +# This Makefile is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This Makefile is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. See . 
+ + + + +# Directory to host outputs +# ------------------------- +a2dir = $(texdir)/tools-per-year +$(a2dir):; mkdir $@ + + + + + +# Table for Figure 1C of Menke+20 +# ------------------------------- +a2mk20f1c = $(a2dir)/tools-per-year.txt +$(a2mk20f1c): $(mk20tab3) | $(a2dir) + + # Remove the (possibly) produced figure that is created from this + # table: it is created by LaTeX's TiKZ package, and includes + # multiple files with a fixed prefix. + rm -f $(tikzdir)/figure-tools-per-year* + + # Find the maximum number of papers. + awk '!/^#/{all[$$1]+=$$2; id[$$1]+=$$3} \ + END{ for(year in all) \ + print year, 100*id[year]/all[year], all[year] \ + }' $< \ + > $@ + + + + + +# Final LaTeX macro +$(mtexdir)/demo-plot.tex: $(a2mk20f1c) $(pconfdir)/menke-demo-year.conf + + # Find the first year (first column of first row) of data. + v=$$(awk 'NR==1{print $$1}' $(a2mk20f1c)) + echo "\newcommand{\menkefirstyear}{$$v}" > $@ + + # Find the number of papers in 1996. + v=$$(awk '$$1==$(menke-demo-year){print $$3}' $(a2mk20f1c)) + echo "\newcommand{\menkenumpapersdemocount}{$$v}" >> $@ + echo "\newcommand{\menkenumpapersdemoyear}{$(menke-demo-year)}" >> $@ diff --git a/reproduce/analysis/make/format.mk b/reproduce/analysis/make/format.mk new file mode 100644 index 0000000..868c411 --- /dev/null +++ b/reproduce/analysis/make/format.mk @@ -0,0 +1,82 @@ +# First step of analysis: +# Prepare the data, return basic values. +# +# As a demonstration analysis to go with the paper, we use the data from +# Menke 2020 (DOI:10.1101/2020.01.15.908111). This is a relevant paper +# because it provides interesting statistics about tools and methods used +# in scientific papers. +# +# Copyright (C) 2020 Mohammad Akhlaghi +# +# This Makefile is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. 
+# +# This Makefile is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. See . + + + + +# Save the "Table 3" spreadsheet from the downloaded `.xlsx' file into a +# simple plain-text file that is easy to use. +a1dir = $(BDIR)/analysis1 +mk20tab3 = $(a1dir)/menke20-table-3.txt +$(a1dir):; mkdir $@ +$(mk20tab3): $(indir)/menke20.xlsx | $(a1dir) + + # Set a base-name for the table-3 data. + base=$(basename $(notdir $<))-table-3 + + # Unfortunately XLSX I/O only works when the input and output are + # in the directory it is running. So first, we need to switch to + # the input directory, run it, then put our desired output where we + # want and delete the extra files. + topdir=$$(pwd) + cd $(indir) + xlsxio_xlsx2csv $(notdir $<) + cp $(notdir $<)."Table 3 All by journal by year".csv $$base.csv + rm $(notdir $<).*.csv + cd $$topdir + + # Read the necessary information. Note that we are dealing with a + # CSV (comma-separated value) file. But when there are commas in a + # string, quotation signs are put around it. The `FPAT' values is + # fully described in the GNU AWK manual. In short, it ensures that + # if there is a comma in the middle of double-quotes, it doesn't + # count as a delimter. + echo "# Column 1: YEAR [counter, i16] Year of journal's publication." > $@.tmp + echo "# Column 2: NUM_PAPERS [counter, i16] Number of studied papers in that journal." >> $@.tmp + echo "# Column 3: NUM_PAPERS_WITH_TOOLS [counter, i16] Number of papers with an identified tool." >> $@.tmp + echo "# Column 4: NUM_ID_TOOLS [counter, i16] Number of software/tools that were identified." >> $@.tmp + echo "# Column 5: JOURNAL_NAME [string, str150] Name of journal." 
>> $@.tmp + awk 'NR>1{printf("%-10d%-10d%-10d%-10d %s\n", $$2, $$3, $$3*$$NF, $$(NF-1), $$1)}' \ + FPAT='([^,]+)|("[^"]+")' $(indir)/$$base.csv >> $@.tmp + + # Set the temporary file as the final target. This was done so if + # there is any possible crash in the steps above, this rule is + # re-run (its final target isn't rebuilt). + mv $@.tmp $@ + + + + + +# Main LaTeX macro file +$(mtexdir)/format.tex: $(mk20tab3) + + # Count the total number of papers in their study. + v=$$(awk '!/^#/{c+=$$2} END{print c}' $(mk20tab3)) + echo "\newcommand{\menkenumpapers}{$$v}" > $@ + + # Count how many unique journals there were in the study. Note that + # the `31' comes because we put 10 characters for each numeric + # column and separated the last numeric column from the string + # column with a space. If the number of numeric columns change in + # the future, the `31' also has to change. + v=$$(awk 'BEGIN{FIELDWIDTHS="41 10000"} !/^#/{print $$2}' \ + $(mk20tab3) | uniq | wc -l) + echo "\newcommand{\menkenumjournals}{$$v}" >> $@ diff --git a/reproduce/analysis/make/top-make.mk b/reproduce/analysis/make/top-make.mk index 6dd322f..000c1fd 100644 --- a/reproduce/analysis/make/top-make.mk +++ b/reproduce/analysis/make/top-make.mk @@ -112,8 +112,9 @@ endif # wild-card like the configuration Makefiles). makesrc = initialize \ download \ + format \ + demo-plot \ verify \ - analysis-1 \ paper @@ -132,5 +133,5 @@ makesrc = initialize \ # But before that, we need to identify the phase for the Makefiles that are # run both in `./project prepare' and `./project make'. 
project-phase = make -include reproduce/analysis/config/*.mk +include reproduce/analysis/config/*.conf include $(foreach s,$(makesrc), reproduce/analysis/make/$(s).mk) diff --git a/tex/src/figure-data-lineage.tex b/tex/src/figure-data-lineage.tex index 010a0be..7379b2f 100644 --- a/tex/src/figure-data-lineage.tex +++ b/tex/src/figure-data-lineage.tex @@ -58,11 +58,11 @@ \node (downloadmk) [node-makefile, at={(-2.93cm,-1.3cm)}, label={[shift={(0,-5mm)}]\texttt{download.mk}}] {}; \node (analysis1mk) [node-makefile, at={(-0.13cm,-1.3cm)}, - label={[shift={(0,-5mm)}]\texttt{analysis1.mk}}] {}; + label={[shift={(0,-5mm)}]\texttt{format.mk}}] {}; \node (analysis2mk) [node-makefile, at={(2.67cm,-1.3cm)}, - label={[shift={(0,-5mm)}]\texttt{analysis2.mk}}] {}; - \node (analysis2mk) [node-makefile, at={(5.47cm,-1.3cm)}, - label={[shift={(0,-5mm)}]\texttt{analysis3.mk}}] {}; + label={[shift={(0,-5mm)}]\texttt{demo-plot.mk}}] {}; + \node (analysis3mk) [node-makefile, at={(5.47cm,-1.3cm)}, + label={[shift={(0,-5mm)}]\texttt{another-step.mk}}] {}; %% verify.mk \node [at={(-5.3cm,-2.8cm)}, @@ -149,7 +149,7 @@ %% analysis1.tex \ifdefined\analysisonetex - \node (a1tex) [node-terminal, at={(-0.13cm,-0.8cm)}] {analysis1.tex}; + \node (a1tex) [node-terminal, at={(-0.13cm,-0.8cm)}] {format.tex}; \draw [rounded corners, -] (a1tex) |- (initialize-south); \fi @@ -167,14 +167,17 @@ %% analysis2.tex \ifdefined\analysistwotex - \node (a2tex) [node-terminal, at={(2.67cm,-0.8cm)}] {analysis2.tex}; + \node (a2tex) [node-terminal, at={(2.67cm,-0.8cm)}] {demo-plot.tex}; \draw [rounded corners, -] (a2tex) |- (initialize-south); \fi %% out-2b.dat \ifdefined\outtwob - \node (out2b) [node-terminal, at={(2.67cm,0.3cm)}] {out-2b.dat}; + \node (menkedemoyear) [node-nonterminal, at={(2.67cm,4.6cm)}] {menke-demo-year.conf}; + \node (a2tex-west) [node-point, at={(1.27cm,-0.8cm)}] {}; + \node (out2b) [node-terminal, at={(2.67cm,0.3cm)}] {tools-per-year.txt}; \draw [->] (out2b) -- (a2tex); + \draw 
[->,rounded corners] (menkedemoyear.west) -| (a2tex-west) |- (a2tex); \fi %% out-2b dependencies @@ -184,39 +187,36 @@ %% analysis3.tex \ifdefined\analysisthreetex - \node (a3tex) [node-terminal, at={(5.47cm,-0.8cm)}] {analysis3.tex}; - \draw [rounded corners, -] (a3tex) |- (initialize-south); + \node [opacity=0.6] (a3tex) [node-terminal, at={(5.47cm,-0.8cm)}] {another-step.tex}; + \draw [opacity=0.6, rounded corners, -, dashed] (a3tex) |- (initialize-south); \fi %% Outputs of analysis3 \ifdefined\analysisthreeouts - \node (out3a) [node-terminal, at={(5.47cm,2.7cm)}] {out-3a.dat}; - \node (out3b) [node-terminal, at={(5.47cm,1.1cm)}] {out-3b.dat}; + \node [opacity=0.6] (out3a) [node-terminal, at={(5.47cm,2.7cm)}] {out-3a.dat}; + \node [opacity=0.6] (out3b) [node-terminal, at={(5.47cm,1.1cm)}] {out-3b.dat}; \node (a3tex-east) [node-point, at={(6.87cm,-0.8cm)}] {}; - \draw [->,rounded corners] (out3a.east) -| (a3tex-east) |- (a3tex); - \draw [->] (out3b) -- (a3tex); + \draw [opacity=0.6, ->,rounded corners, dashed] (out3a.east) -| (a3tex-east) |- (a3tex); + \draw [opacity=0.6, ->, dashed] (out3b) -- (a3tex); \fi %% out-2a.dat \ifdefined\outtwoa - \node (out2a) [node-terminal, at={(2.67cm,1.9cm)}] {out-2a.dat}; - \draw [->, rounded corners] (out2a.south) |- (out3b); + \node [opacity=0.6] (out2a) [node-terminal, at={(2.67cm,1.9cm)}] {demo-out.dat}; + \draw [opacity=0.6, ->, rounded corners, dashed] (out2a.south) |- (out3b); \fi %% Dependencies of out-2a \ifdefined\outtwoadep - \node (a2conf1) [node-nonterminal, at={(2.67cm,5.3cm)}] {param-2a.conf}; - \node (a2conf2) [node-nonterminal, at={(2.67cm,4.6cm)}] {param-2b.conf}; \node (out2a-west) [node-point, at={(1.27cm,1.9cm)}] {}; - \draw [->,rounded corners] (a2conf1.west) -| (out2a-west) |- (out2a); - \draw [->,rounded corners] (a2conf2.west) -| (out2a-west) |- (out2a); - %\draw [->] (input1) -- (out2a); + \draw [opacity=0.6, ->, dashed] (input2) -- (out2a); \fi %% Dependencies of out-3a \ifdefined\outthreeadep - \node 
(out3a-west) [node-point, at={(4.07cm,2.7cm)}] {}; - \node (a3conf1) [node-nonterminal, at={(5.47cm,4.6cm)}] {param-3.conf}; - \draw [rounded corners] (a3conf1.west) -| (out3a-west) |- (out3a); + \node [opacity=0.6] (out3a-west) [node-point, at={(4.07cm,2.7cm)}] {}; + \draw [opacity=0.6, ->,rounded corners, dashed] (input2) |- (out3a); + \node [opacity=0.6] (a3conf1) [node-nonterminal, at={(5.47cm,4.6cm)}] {param-3.conf}; + \draw [opacity=0.6, rounded corners, dashed] (a3conf1.west) -| (out3a-west) |- (out3a); \fi \end{tikzpicture} diff --git a/tex/src/figure-download.tex b/tex/src/figure-download.tex deleted file mode 100644 index b9da02f..0000000 --- a/tex/src/figure-download.tex +++ /dev/null @@ -1,8 +0,0 @@ -\begin{tcolorbox} - \footnotesize - \texttt{\mkcomment{Write download URL into the paper (through a LaTeX macro).}} - - \texttt{\mktarget{\$(mtexdir)/download.tex}: \$(\mkvar{indir})/menke20.xlsx} - - \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand{\textbackslash{}menketwentyurl}\{\mktarget{\$(MK20URL)}\}" > \$@} -\end{tcolorbox} diff --git a/tex/src/figure-file-architecture.tex b/tex/src/figure-file-architecture.tex index 8fb1a6d..1fc26c5 100644 --- a/tex/src/figure-file-architecture.tex +++ b/tex/src/figure-file-architecture.tex @@ -88,7 +88,7 @@ \node [node-nonterminal-thin, at={(2.95cm,0.8cm)}] {top-prepare.mk}; \node [node-nonterminal-thin, at={(2.95cm,0.3cm)}] {top-make.mk}; \node [node-nonterminal-thin, at={(2.95cm,-0.2cm)}] {initialize.mk}; - \node [node-nonterminal-thin, at={(2.95cm,-0.7cm)}] {analysis1.mk}; + \node [node-nonterminal-thin, at={(2.95cm,-0.7cm)}] {format.mk}; %% reproduce/analysis/bash/ \node [dirbox, at={(0.15cm,-1.3cm)}, minimum width=2.6cm, minimum height=1.1cm, diff --git a/tex/src/figure-inputconf.tex b/tex/src/figure-inputconf.tex new file mode 100644 index 0000000..f09bebd --- /dev/null +++ b/tex/src/figure-inputconf.tex @@ -0,0 +1,8 @@ +\begin{tcolorbox} + \footnotesize + \texttt{\mkvar{MK20DATA} = menke20.xlsx}\\ + 
\texttt{\mkvar{MK20MD5}{ } = 8e4eee64791f351fec58680126d558a0}\\ + \texttt{\mkvar{MK20SIZE} = 1.9MB}\\ + \texttt{\mkvar{MK20URL}{ } = https://the.full.url/is/too/large/for/here/media-1.xlsx}\\ + \vspace{-3mm} +\end{tcolorbox} diff --git a/tex/src/figure-mk20tab3.tex b/tex/src/figure-mk20tab3.tex deleted file mode 100644 index 3cc0cd3..0000000 --- a/tex/src/figure-mk20tab3.tex +++ /dev/null @@ -1,59 +0,0 @@ -\begin{tcolorbox} - \footnotesize - \texttt{\mkcomment{1ST MAKE RULE: build the directory hosting the converted table.}} - - \texttt{\mkvar{a1dir} = \$(\mkvar{BDIR})/analysis-1} - - \texttt{\mktarget{\$(a1dir)}:} - - \texttt{\mktab{}\mkprog{mkdir} \$@} - - \vspace{2em} - \texttt{\mkcomment{2ND MAKE RULE: Convert the XLSX table to a simple plain-text table.}} - - \texttt{\mkvar{mk20tab3} = \$(\mkvar{a1dir})/menke20-table-3.txt} - - \texttt{\mktarget{\$(mk20tab3)}: \$(\mkvar{indir})/menke20.xlsx | \$(\mkvar{a1dir})} - - \texttt{\recipecomment{Call XLSX I/O to convert all the spreadsheets into different CSV files.}} - - \texttt{\recipecomment{We only want the `table-3' spreadsheet, but XLSX I/O doesn't allow setting its}} - - \texttt{\recipecomment{output filename. 
For simplicity, let's assume its written in `table-3.csv'.}} - - \texttt{\mktab{}\mkprog{xlsxio\_xlsx2csv} \$<} - - \vspace{0.5em} - \texttt{\recipecomment{Use GNU AWK to keep the desired columns in space-separated, fixed-width format.}} - - \texttt{\recipecomment{With `FPAT' commas within double quotes are not counted as columns.}} - - \texttt{\mktab{}\mkprog{awk} 'NR>1\{printf("\%-10d\%-10d\%-10d \%s\textbackslash{}n", \$\$2, \$\$3, \$\$(NF-1)*\$\$NF, \$\$1)\}' \textbackslash} - - \texttt{\mktab{}{ }{ }{ }{ }FPAT='([\^{},]+)|("[\^{}"]+")' table-3.csv > \$@} - - \vspace{0.5em} - \texttt{\recipecomment{Delete the temporary CSV file.}} - - \texttt{\mktab{}\mkprog{rm} table-3.csv} - - \vspace{2em} - \texttt{\mkcomment{3RD MAKE RULE: Main LaTeX macro file for reported values.}} - - \texttt{\mktarget{\$(mtexdir)/analysis1.tex}: \$(\mkvar{mk20tab3)}} - - \texttt{\recipecomment{Count the total number of papers in their study to report in this paper.}} - - \texttt{\mktab{}v=\$\$(\mkprog{awk} '\!/\^{}\#/\{c+=\$\$2\} END\{print c\}' \$(\mkvar{mk20tab3)})} - - \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand\{\textbackslash{}menkenumpapers\}\{\$\$v\}" > \$@} - - \vspace{0.5em} - \texttt{\recipecomment{Count total number of journals in that study.}} - - \texttt{\mktab{}v=\$\$(awk 'BEGIN{FIELDWIDTHS="31 10000"} !/\^\#/\{print \$\$2\}' \$(mk20tab3) \textbackslash} - - \texttt{\mktab{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }| uniq | wc -l)} - - \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand\{\textbackslash{}menkenumjournals\}\{\$\$v\}" >> \$@} -\end{tcolorbox} diff --git a/tex/src/figure-src-demoplot.tex b/tex/src/figure-src-demoplot.tex new file mode 100644 index 0000000..6d788f5 --- /dev/null +++ b/tex/src/figure-src-demoplot.tex @@ -0,0 +1,32 @@ +\begin{tcolorbox}[title=\inlinecode{\textcolor{white}{demo-plot.mk}}\hfill(Simplified contents)] + \footnotesize + \texttt{\mkcomment{1ST MAKE RULE: build the directory hosting the converted table.}}\\ + \texttt{\mkvar{a2dir} 
= \$(\mkvar{texdir})/tools-per-year}\\ + \texttt{\mktarget{\$(a2dir)}:; \mkprog{mkdir} \$@} + + \vspace{2em} + \texttt{\mkcomment{2ND MAKE RULE: extract necessary info from raw table.}}\\ + \texttt{\mkvar{a2mk20f1c} = \$(\mkvar{a2dir})/tools-per-year.txt}\\ + \texttt{\mktarget{\$(a2mk20f1c)}: \$(\mkvar{mk20tab3}) | \$(\mkvar{a2dir})}\\ + \texttt{\mktab{}\mkprog{awk} '!/\^{}\#/ \{all[\$\$1]+=\$\$2; id[\$\$1]+=\$\$3;\}} \textbackslash\\ + \texttt{\mktab{}{ }{ }{ }{ }{ }END\{ for(year in all) print year, 100*id[year]/all[year], all[year] \}}' \textbackslash\\ + \texttt{\mktab{}{ }{ }{ }{ }> \$@} + + \vspace{2em} + \texttt{\mkcomment{3RD MAKE RULE: Main LaTeX macro file for reported values in text.}}\\ + \texttt{\mkvar{pconfdir} = reproduce/analysis/config}\\ + \texttt{\mktarget{\$(mtexdir)/demo-plot.tex}: \$(\mkvar{a2mk20f1c}) \$(\mkvar{pconfdir})/menke-demo-year.conf} + + %% We need an empty line here for the extra space to work. + \texttt{\recipecomment{First year data were taken (first column of first row).}}\\ + \texttt{\mktab{}v=\$\$(awk 'NR==1\{print \$\$1\}' \$(\mkvar{a2mk20f1c}))}\\ + \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand\{\textbackslash{}menkefirstyear\}\{\$\$v\}" > \$@} + + %% We need an empty line here for the extra space to work. + \texttt{\recipecomment{Number of papers in the demonstration year. 
The year is defined in}} + + \texttt{\recipecomment{`\$(pconfdir)/menke-demo-year.conf' as `menke-demo-year' and also passed onto LaTeX.}}\\ + \texttt{\mktab{}v=\$\$(awk '\$\$1==\$(\mkvar{menke-demo-year})\{print \$\$3\}' \$(\mkvar{a2mk20f1c}))}\\ + \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand\{\textbackslash{}menkenumpapersdemocount\}\{\$\$v\}"{ }>> \$@} \\ + \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand\{\textbackslash{}menkenumpapersdemoyear\}\{\$(\mkvar{menke-demo-year})\}"{ }>> \$@} +\end{tcolorbox} diff --git a/tex/src/figure-src-download.tex b/tex/src/figure-src-download.tex new file mode 100644 index 0000000..74026b8 --- /dev/null +++ b/tex/src/figure-src-download.tex @@ -0,0 +1,8 @@ +\begin{tcolorbox}[title=\inlinecode{\textcolor{white}{download.mk}} \textcolor{white}{(only \LaTeX{} macro's rule.}] + \footnotesize + \texttt{\mkcomment{Write download URL into the paper (through a LaTeX macro).}} + + \texttt{\mktarget{\$(mtexdir)/download.tex}: \$(\mkvar{indir})/menke20.xlsx} + + \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand{\textbackslash{}menketwentyurl}\{\mktarget{\$(MK20URL)}\}" > \$@} +\end{tcolorbox} diff --git a/tex/src/figure-src-format.tex b/tex/src/figure-src-format.tex new file mode 100644 index 0000000..ba4458e --- /dev/null +++ b/tex/src/figure-src-format.tex @@ -0,0 +1,59 @@ +\begin{tcolorbox}[title=\inlinecode{\textcolor{white}{format.mk}}\hfill(Simplified contents)] + \footnotesize + \texttt{\mkcomment{1ST MAKE RULE: build the directory hosting the converted table.}} + + \texttt{\mkvar{a1dir} = \$(\mkvar{BDIR})/format} + + \texttt{\mktarget{\$(a1dir)}:} + + \texttt{\mktab{}\mkprog{mkdir} \$@} + + \vspace{2em} + \texttt{\mkcomment{2ND MAKE RULE: Convert the XLSX table to a simple plain-text table.}} + + \texttt{\mkvar{mk20tab3} = \$(\mkvar{a1dir})/menke20-table-3.txt} + + \texttt{\mktarget{\$(mk20tab3)}: \$(\mkvar{indir})/menke20.xlsx | \$(\mkvar{a1dir})} + + \texttt{\recipecomment{Call XLSX I/O to convert 
all the spreadsheets into different CSV files.}} + + \texttt{\recipecomment{We only want the `table-3' spreadsheet, but XLSX I/O doesn't allow setting its}} + + \texttt{\recipecomment{output filename. For simplicity, let's assume its written in `table-3.csv'.}} + + \texttt{\mktab{}\mkprog{xlsxio\_xlsx2csv} \$<} + + \vspace{0.5em} + \texttt{\recipecomment{Use GNU AWK to keep the desired columns in space-separated, fixed-width format.}} + + \texttt{\recipecomment{With `FPAT' commas within double quotes are not counted as columns.}} + + \texttt{\mktab{}\mkprog{awk} 'NR>1\{printf("\%-10d\%-10d\%-10d \%s\textbackslash{}n", \$\$2, \$\$3, \$\$(NF-1)*\$\$NF, \$\$1)\}' \textbackslash} + + \texttt{\mktab{}{ }{ }{ }{ }FPAT='([\^{},]+)|("[\^{}"]+")' table-3.csv > \$@} + + \vspace{0.5em} + \texttt{\recipecomment{Delete the temporary CSV file.}} + + \texttt{\mktab{}\mkprog{rm} table-3.csv} + + \vspace{2em} + \texttt{\mkcomment{3RD MAKE RULE: Main LaTeX macro file for reported values.}} + + \texttt{\mktarget{\$(mtexdir)/format.tex}: \$(\mkvar{mk20tab3)}} + + \texttt{\recipecomment{Count the total number of papers in their study to report in this paper.}} + + \texttt{\mktab{}v=\$\$(\mkprog{awk} '\!/\^{}\#/\{c+=\$\$2\} END\{print c\}' \$(\mkvar{mk20tab3)})} + + \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand\{\textbackslash{}menkenumpapers\}\{\$\$v\}" > \$@} + + \vspace{0.5em} + \texttt{\recipecomment{Count total number of journals in that study.}} + + \texttt{\mktab{}v=\$\$(awk 'BEGIN{FIELDWIDTHS="31 10000"} !/\^\#/\{print \$\$2\}' \$(mk20tab3) \textbackslash} + + \texttt{\mktab{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }| uniq | wc -l)} + + \texttt{\mktab{}\mkprog{echo} "\textbackslash{}newcommand\{\textbackslash{}menkenumjournals\}\{\$\$v\}" >> \$@} +\end{tcolorbox} diff --git a/tex/src/figure-src-topmake.tex b/tex/src/figure-src-topmake.tex new file mode 100644 index 0000000..bd4b67d --- /dev/null +++ b/tex/src/figure-src-topmake.tex @@ -0,0 +1,24 @@ 
+\begin{tcolorbox}[title=\inlinecode{\textcolor{white}{top-make.mk}}\hfill\textcolor{white}{(simplified)}] + \footnotesize + + \texttt{\mkcomment{Ultimate target/purpose of project (`paper.pdf' is the final target of the final subMakefile}}\par + \texttt{\mkcomment{that is loaded/included below)}}\par + \texttt{\mktarget{all}: paper.pdf} + + \vspace{1em} + \texttt{\mkcomment{List of subMakefiles to be loaded in order.}}\par + \texttt{\mkvar{makesrc} = initialize \textbackslash}\par + \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }download \textbackslash}\par + \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }format \textbackslash}\par + \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }demo-plot \textbackslash}\par + \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }verify \textbackslash}\par + \texttt{{ }{ }{ }{ }{ }{ }{ }{ }{ }{ }paper}\par + + \vspace{1em} + \texttt{\mkcomment{Load all the configuration files.}}\par + \texttt{\textcolor{purple}{include} reproduce/analysis/config/*.conf} + + \vspace{1em} + \texttt{\mkcomment{Load/include the subMakefiles in the specified order.}}\par + \texttt{\textcolor{purple}{include} \$(\textcolor{blue}{foreach} s, \$(\mkvar{makesrc}), reproduce/analysis/make/\$(\mkvar{s}).mk)} +\end{tcolorbox} diff --git a/tex/src/figure-tools-per-year.tex b/tex/src/figure-tools-per-year.tex new file mode 100644 index 0000000..75557ac --- /dev/null +++ b/tex/src/figure-tools-per-year.tex @@ -0,0 +1,34 @@ +\begin{tikzpicture} + \begin{axis}[ + ymin=0, + ymax=100, + width=\linewidth, + height=0.3\linewidth, + xlabel={Year}, + ylabel={Frac. papers with tools}, + axis y line*=left, + enlarge x limits = false, + yticklabel=\pgfmathprintnumber{\tick}\,\%, + x tick label style={/pgf/number format/1000 sep=}, + ] + + %% Linear plot, showing the number of papers mentioning tools. + \addplot+ [mark=none, very thick, green!60!black] + table {tex/build/tools-per-year/tools-per-year.txt}; + \end{axis} + + %% Add the right-side Y axis. 
+ \begin{axis}[ + ymode=log, + width=\linewidth, + height=0.3\linewidth, + axis x line=none, + axis y line*=right, + enlarge x limits = false, + ylabel=Num. papers (log-scale), + max space between ticks=20, + ] + \addplot+ [ybar, mark=none, fill=red!50!white, red, opacity=0.25] + table [x index=0, y index=2] {tex/build/tools-per-year/tools-per-year.txt}; + \end{axis} +\end{tikzpicture} -- cgit v1.2.1