aboutsummaryrefslogtreecommitdiff
path: root/reproduce/analysis/make
diff options
context:
space:
mode:
Diffstat (limited to 'reproduce/analysis/make')
-rw-r--r--reproduce/analysis/make/delete-me.mk129
-rw-r--r--reproduce/analysis/make/demo-plot.mk59
-rw-r--r--reproduce/analysis/make/download.mk14
-rw-r--r--reproduce/analysis/make/format.mk82
-rw-r--r--reproduce/analysis/make/initialize.mk1
-rw-r--r--reproduce/analysis/make/paper.mk6
-rw-r--r--reproduce/analysis/make/top-make.mk3
-rw-r--r--reproduce/analysis/make/verify.mk3
8 files changed, 155 insertions, 142 deletions
diff --git a/reproduce/analysis/make/delete-me.mk b/reproduce/analysis/make/delete-me.mk
deleted file mode 100644
index 8b97673..0000000
--- a/reproduce/analysis/make/delete-me.mk
+++ /dev/null
@@ -1,129 +0,0 @@
-# Dummy Makefile to create a random dataset for plotting.
-#
-# Copyright (C) 2018-2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
-#
-# This Makefile is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation, either version 3 of the License, or (at your
-# option) any later version.
-#
-# This Makefile is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-# Public License for more details.
-#
-# A copy of the GNU General Public License is available at
-# <http://www.gnu.org/licenses/>.
-
-
-
-
-
-# Dummy dataset
-# -------------
-#
-# We will use AWK to generate a table showing X and X^2 and draw its plot.
-delete-numdir = $(texdir)/delete-me-num
-delete-num = $(delete-numdir)/data.txt
-$(delete-numdir): | $(texdir); mkdir $@
-$(delete-num): $(pconfdir)/delete-me-num.conf | $(delete-numdir)
-
- # When the plotted values are re-made, it is necessary to also
- # delete the TiKZ externalized files so the plot is also re-made.
- rm -f $(tikzdir)/delete-me.pdf
-
- # Generate the table of random values.
- awk 'BEGIN {for(i=1;i<=$(delete-me-num);i+=0.5) print i, i*i; }' > $@
-
-
-
-
-
-# WFPC2 image PDF
-# -----------------
-#
-# For an example image, we'll make a PDF copy of the WFPC II image to
-# display in the paper.
-delete-demodir = $(texdir)/delete-me-demo
-$(delete-demodir): | $(texdir); mkdir $@
-delete-pdf = $(delete-demodir)/wfpc2.pdf
-$(delete-pdf): $(delete-demodir)/%.pdf: $(indir)/%.fits | $(delete-demodir)
-
- # When the plotted values are re-made, it is necessary to also
- # delete the TiKZ externalized files so the plot is also re-made.
- rm -f $(tikzdir)/delete-me-wfpc2.pdf
-
- # Convert the dataset to a PDF.
- astconvertt --colormap=gray --fluxhigh=4 $< -h0 -o$@
-
-
-
-
-
-# Histogram of WFPC2 image
-# ------------------------
-#
-# For an example plot, we'll show the pixel value histogram also.
-delete-histogram = $(delete-demodir)/wfpc2-hist.txt
-$(delete-histogram): $(delete-demodir)/%-hist.txt: $(indir)/%.fits \
- | $(delete-demodir)
-
- # When the plotted values are re-made, it is necessary to also
- # delete the TiKZ externalized files so the plot is also re-made.
- rm -f $(tikzdir)/delete-me-wfpc2.pdf
-
- # Generate the pixel value distribution
- aststatistics --lessthan=5 $< -h0 --histogram -o$@
-
-
-
-
-
-# Basic statistics
-# ----------------
-#
-# This is just as a demonstration on how to get analysic configuration
-# parameters from variables defined in `reproduce/analysis/config/'.
-delete-stats = $(delete-demodir)/wfpc2-stats.txt
-$(delete-stats): $(delete-demodir)/%-stats.txt: $(indir)/%.fits \
- | $(delete-demodir)
- aststatistics $< -h0 --mean --median > $@
-
-
-
-
-
-# TeX macros
-# ----------
-#
-# This is how we write the necessary parameters in the final PDF.
-#
-# NOTE: In LaTeX you cannot use any non-alphabetic character in a variable
-# name.
-$(mtexdir)/delete-me.tex: $(delete-num) $(delete-pdf) $(delete-histogram) \
- $(delete-stats)
-
- # Write the number of random values used.
- echo "\newcommand{\deletemenum}{$(delete-me-num)}" > $@
-
- # Note that since Make variables start with a `$(', if you want to
- # use `$' within the shell (not Make), you have to quote any
- # occurance of `$' with another `$'. That is why there are `$$' in
- # the AWK command below.
- #
- # Here, we are first using AWK to find the minimum and maximum
- # values, then using it again to read each separately to use in the
- # macro definition.
- mm=$$(awk 'BEGIN{min=99999; max=-min}
- !/^#/{if($$2>max) max=$$2; if($$2<min) min=$$2;}
- END{print min, max}' $(delete-num));
- v=$$(echo "$$mm" | awk '{printf "%.3f", $$1}');
- echo "\newcommand{\deletememin}{$$v}" >> $@
- v=$$(echo "$$mm" | awk '{printf "%.3f", $$2}');
- echo "\newcommand{\deletememax}{$$v}" >> $@
-
- # Write the statistics of the WFPC2 image as a macro.
- mean=$$(awk '{printf("%.2f", $$1)}' $(delete-stats))
- echo "\newcommand{\deletemewfpctwomean}{$$mean}" >> $@
- median=$$(awk '{printf("%.2f", $$2)}' $(delete-stats))
- echo "\newcommand{\deletemewfpctwomedian}{$$median}" >> $@
diff --git a/reproduce/analysis/make/demo-plot.mk b/reproduce/analysis/make/demo-plot.mk
new file mode 100644
index 0000000..caf77af
--- /dev/null
+++ b/reproduce/analysis/make/demo-plot.mk
@@ -0,0 +1,59 @@
+# Second step of analysis:
+# Data for plot of number/fraction of tools per year.
+#
+# Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
+#
+# This Makefile is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or (at your
+# option) any later version.
+#
+# This Makefile is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details. See <http://www.gnu.org/licenses/>.
+
+
+
+
+# Output directory (under `$(texdir)', which must therefore exist first)
+# -----------------------------------------------------------------------
+a2dir = $(texdir)/tools-per-year
+$(a2dir): | $(texdir); mkdir $@
+
+
+
+
+
+# Table for Figure 1C of Menke+20
+# -------------------------------
+a2mk20f1c = $(a2dir)/tools-per-year.txt
+$(a2mk20f1c): $(mk20tab3) | $(a2dir)
+
+ # Remove any figure previously built from this table: LaTeX's
+ # TiKZ package writes it as multiple files with a fixed prefix.
+ rm -f $(tikzdir)/figure-tools-per-year*
+
+ # For each year, print the year, the percentage of papers with an
+ # identified tool and the number of papers. AWK's `for(in)' order
+ # is unspecified, so GNU AWK is told to traverse by ascending year.
+ awk '!/^#/{all[$$1]+=$$2; id[$$1]+=$$3} \
+ END{ PROCINFO["sorted_in"]="@ind_num_asc"; for(year in all) \
+ print year, 100*id[year]/all[year], all[year] \
+ }' $< \
+ > $@
+
+
+
+
+
+# Final LaTeX macro (values read from the tools-per-year table)
+$(mtexdir)/demo-plot.tex: $(a2mk20f1c) $(pconfdir)/menke-demo-year.conf | $(mtexdir)
+
+ # Find the first (smallest) year in the first column of the data.
+ v=$$(awk 'NR==1 || $$1+0<m {m=$$1+0} END{print m}' $(a2mk20f1c))
+ echo "\newcommand{\menkefirstyear}{$$v}" > $@
+
+ # Find the number of papers in the year set by `menke-demo-year'.
+ v=$$(awk '$$1==$(menke-demo-year){print $$3}' $(a2mk20f1c))
+ echo "\newcommand{\menkenumpapersdemocount}{$$v}" >> $@
+ echo "\newcommand{\menkenumpapersdemoyear}{$(menke-demo-year)}" >> $@
diff --git a/reproduce/analysis/make/download.mk b/reproduce/analysis/make/download.mk
index 07e9f27..cf5bfa4 100644
--- a/reproduce/analysis/make/download.mk
+++ b/reproduce/analysis/make/download.mk
@@ -50,12 +50,12 @@
# progress at every moment.
$(indir):; mkdir $@
downloadwrapper = $(bashdir)/download-multi-try
-inputdatasets = $(foreach i, wfpc2, $(indir)/$(i).fits)
-$(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
+inputdatasets = $(indir)/menke20.xlsx
+$(inputdatasets): $(indir)/%: | $(indir) $(lockdir)
# Set the necessary parameters for this input file.
- if [ $* = wfpc2 ]; then
- origname=$(WFPC2IMAGE); url=$(WFPC2URL); mdf=$(WFPC2MD5);
+ if [ $* = menke20.xlsx ]; then
+ origname=$(MK20DATA); fullurl=$(MK20URL); mdf=$(MK20MD5);
else
echo; echo; echo "Not recognized input dataset: '$*.fits'."
echo; echo; exit 1
@@ -73,7 +73,7 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
else
touch $(lockdir)/download
$(downloadwrapper) "wget --no-use-server-timestamps -O" \
- $(lockdir)/download $$url/$$origname $@
+ $(lockdir)/download $$fullurl $@
fi
# Check the md5 sum to see if this is the proper dataset.
@@ -94,5 +94,5 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
#
# It is very important to mention the address where the data were
# downloaded in the final report.
-$(mtexdir)/download.tex: $(pconfdir)/INPUTS.conf | $(mtexdir)
- echo "\\newcommand{\\wfpctwourl}{$(WFPC2URL)}" > $@
+$(mtexdir)/download.tex: $(indir)/menke20.xlsx | $(mtexdir)
+ echo "\newcommand{\menketwentyurl}{$(MK20URL)}" > $@
diff --git a/reproduce/analysis/make/format.mk b/reproduce/analysis/make/format.mk
new file mode 100644
index 0000000..868c411
--- /dev/null
+++ b/reproduce/analysis/make/format.mk
@@ -0,0 +1,82 @@
+# First step of analysis:
+# Prepare the data, return basic values.
+#
+# As a demonstration analysis to go with the paper, we use the data from
+# Menke 2020 (DOI:10.1101/2020.01.15.908111). This is a relevant paper
+# because it provides interesting statistics about tools and methods used
+# in scientific papers.
+#
+# Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
+#
+# This Makefile is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or (at your
+# option) any later version.
+#
+# This Makefile is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details. See <http://www.gnu.org/licenses/>.
+
+
+
+
+# Save the "Table 3" spreadsheet from the downloaded `.xlsx' file into a
+# simple plain-text file that is easy to use.
+a1dir = $(BDIR)/analysis1
+mk20tab3 = $(a1dir)/menke20-table-3.txt
+$(a1dir):; mkdir $@
+$(mk20tab3): $(indir)/menke20.xlsx | $(a1dir)
+
+ # Set a base-name for the table-3 data.
+ base=$(basename $(notdir $<))-table-3
+
+ # Unfortunately XLSX I/O only works when the input and output are
+ # in the directory it is running. So first, we need to switch to
+ # the input directory, run it, then put our desired output where we
+ # want and delete the extra files.
+ topdir=$$(pwd)
+ cd $(indir)
+ xlsxio_xlsx2csv $(notdir $<)
+ cp $(notdir $<)."Table 3 All by journal by year".csv $$base.csv
+ rm $(notdir $<).*.csv
+ cd $$topdir
+
+ # Read the necessary information. Note that we are dealing with a
+ # CSV (comma-separated value) file. But when there are commas in a
+ # string, quotation signs are put around it. The `FPAT' value is
+ # fully described in the GNU AWK manual. In short, it ensures that
+ # if there is a comma in the middle of double-quotes, it doesn't
+ # count as a delimiter.
+ echo "# Column 1: YEAR [counter, i16] Year of journal's publication." > $@.tmp
+ echo "# Column 2: NUM_PAPERS [counter, i16] Number of studied papers in that journal." >> $@.tmp
+ echo "# Column 3: NUM_PAPERS_WITH_TOOLS [counter, i16] Number of papers with an identified tool." >> $@.tmp
+ echo "# Column 4: NUM_ID_TOOLS [counter, i16] Number of software/tools that were identified." >> $@.tmp
+ echo "# Column 5: JOURNAL_NAME [string, str150] Name of journal." >> $@.tmp
+ awk 'NR>1{printf("%-10d%-10d%-10d%-10d %s\n", $$2, $$3, $$3*$$NF, $$(NF-1), $$1)}' \
+ FPAT='([^,]+)|("[^"]+")' $(indir)/$$base.csv >> $@.tmp
+
+ # Set the temporary file as the final target. This was done so if
+ # there is any possible crash in the steps above, this rule is
+ # re-run (its final target isn't rebuilt).
+ mv $@.tmp $@
+
+
+
+
+
+# Main LaTeX macro file
+$(mtexdir)/format.tex: $(mk20tab3) | $(mtexdir)
+
+ # Count the total number of papers in their study.
+ v=$$(awk '!/^#/{c+=$$2} END{print c}' $(mk20tab3))
+ echo "\newcommand{\menkenumpapers}{$$v}" > $@
+
+ # Count how many unique journals there were in the study. The `41'
+ # is four 10-character numeric columns plus the one space before
+ # the string column: if the number of numeric columns changes, it
+ # has to change too. The names are sorted before counting so that
+ # repeated journals are merged even when their rows aren't adjacent.
+ v=$$(awk 'BEGIN{FIELDWIDTHS="41 10000"} !/^#/{print $$2}' \
+ $(mk20tab3) | sort -u | wc -l)
+ echo "\newcommand{\menkenumjournals}{$$v}" >> $@
diff --git a/reproduce/analysis/make/initialize.mk b/reproduce/analysis/make/initialize.mk
index 79f9266..ce4e488 100644
--- a/reproduce/analysis/make/initialize.mk
+++ b/reproduce/analysis/make/initialize.mk
@@ -132,6 +132,7 @@ curdir := $(shell echo $$(pwd))
# we are also going to overwrite `TEXINPUTS' just before `pdflatex'.
.ONESHELL:
.SHELLFLAGS = -ec
+export TERM=xterm
export TEXINPUTS :=
export CCACHE_DISABLE := 1
export PATH := $(installdir)/bin
diff --git a/reproduce/analysis/make/paper.mk b/reproduce/analysis/make/paper.mk
index 8a14573..67db364 100644
--- a/reproduce/analysis/make/paper.mk
+++ b/reproduce/analysis/make/paper.mk
@@ -44,7 +44,7 @@ $(mtexdir)/project.tex: $(mtexdir)/verify.tex
# If no PDF is requested, or if LaTeX isn't available, don't
# continue to building the final PDF. Otherwise, merge all the TeX
# macros into one for building the PDF.
- @if [ -f .local/bin/pdflatex ] && [ x"$(pdf-build-final)" != x ]; then
+ @if [ -f .local/bin/lualatex ] && [ x"$(pdf-build-final)" != x ]; then
# Put a LaTeX input command for all the necessary macro files.
rm -f $(mtexdir)/project.tex
@@ -100,7 +100,7 @@ $(texbdir)/paper.bbl: tex/src/references.tex $(mtexdir)/dependencies-bib.tex \
p=$$(pwd)
export TEXINPUTS=$$p:
cd $(texbdir);
- pdflatex -shell-escape -halt-on-error $$p/paper.tex
+ lualatex -shell-escape -halt-on-error $$p/paper.tex
biber paper
fi
@@ -127,7 +127,7 @@ paper.pdf: $(mtexdir)/project.tex paper.tex $(texbdir)/paper.bbl
p=$$(pwd)
export TEXINPUTS=$$p:
cd $(texbdir)
- pdflatex -shell-escape -halt-on-error $$p/paper.tex
+ lualatex -shell-escape -halt-on-error $$p/paper.tex
# Come back to the top project directory and copy the built PDF
# file here.
diff --git a/reproduce/analysis/make/top-make.mk b/reproduce/analysis/make/top-make.mk
index 6c940b8..30d537a 100644
--- a/reproduce/analysis/make/top-make.mk
+++ b/reproduce/analysis/make/top-make.mk
@@ -112,7 +112,8 @@ endif
# wild-card like the configuration Makefiles).
makesrc = initialize \
download \
- delete-me \
+ format \
+ demo-plot \
verify \
paper
diff --git a/reproduce/analysis/make/verify.mk b/reproduce/analysis/make/verify.mk
index d11dcbf..f0bcf10 100644
--- a/reproduce/analysis/make/verify.mk
+++ b/reproduce/analysis/make/verify.mk
@@ -114,8 +114,7 @@ $(mtexdir)/verify.tex: $(foreach s, $(verify-dep), $(mtexdir)/$(s).tex)
# Verify TeX macros (the values that go into the PDF text).
for m in $(verify-check); do
file=$(mtexdir)/$$m.tex
- if [ $$m == download ]; then s=6749e17ce606d57d30cebdbc1a5d23ad
- elif [ $$m == delete-me ]; then s=711e2f7fa1f16ecbeeb3df6bcb4ec705
+ if [ $$m == download ]; then s=XXXXX
else echo; echo "'$$m' not recognized."; exit 1
fi
$(call verify-txt-no-comments-leading-space, $$file, $$s)