aboutsummaryrefslogtreecommitdiff
path: root/reproduce/analysis/make
diff options
context:
space:
mode:
authorMohammad Akhlaghi <mohammad@akhlaghi.org>2020-02-16 03:31:26 +0000
committerMohammad Akhlaghi <mohammad@akhlaghi.org>2020-02-16 03:31:26 +0000
commit13cb65a2eab708b0bd4777601331b3e83e96beac (patch)
tree60c7c846592e698789522cf7d80c18cf404473e1 /reproduce/analysis/make
parent241515c3dc6ef7f7ee6b0db22312e316196ecb89 (diff)
Menke+2020 data is now imported and ready for later steps in plain text
The main problem with this dataset was the names of the journals (which sometimes have single quotes or apostrophes in them, which is really annoying for SED)! But ultimately, for the simple study we want to do here, the journal names are irrelevant, so in the end I just ignored the names. Later we can set an identifier for the journals if necessary. But now we have the basic information in a form that is usable in a plot to show in this paper.
Diffstat (limited to 'reproduce/analysis/make')
-rw-r--r--reproduce/analysis/make/download.mk12
-rw-r--r--reproduce/analysis/make/menke2020.mk65
-rw-r--r--reproduce/analysis/make/top-make.mk1
3 files changed, 72 insertions, 6 deletions
diff --git a/reproduce/analysis/make/download.mk b/reproduce/analysis/make/download.mk
index 7c290f4..7e61cb8 100644
--- a/reproduce/analysis/make/download.mk
+++ b/reproduce/analysis/make/download.mk
@@ -49,12 +49,12 @@
# progress at every moment.
$(indir):; mkdir $@
downloadwrapper = $(bashdir)/download-multi-try
-inputdatasets = $(foreach i, wfpc2, $(indir)/$(i).fits)
-$(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
+inputdatasets = $(indir)/menke-etal-2020.xlsx
+$(inputdatasets): $(indir)/%: | $(indir) $(lockdir)
# Set the necessary parameters for this input file.
- if [ $* = wfpc2 ]; then
- origname=$(WFPC2IMAGE); url=$(WFPC2URL); mdf=$(WFPC2MD5);
+ if [ $* = menke-etal-2020.xlsx ]; then
+ origname=$(MK20DATA); fullurl=$(MK20URL); mdf=$(MK20MD5);
else
echo; echo; echo "Not recognized input dataset: '$*.fits'."
echo; echo; exit 1
@@ -72,7 +72,7 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
else
touch $(lockdir)/download
$(downloadwrapper) "wget --no-use-server-timestamps -O" \
- $(lockdir)/download $$url/$$origname $@
+ $(lockdir)/download $$fullurl $@
fi
# Check the md5 sum to see if this is the proper dataset.
@@ -94,4 +94,4 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
# It is very important to mention the address where the data were
# downloaded in the final report.
$(mtexdir)/download.tex: $(pconfdir)/INPUTS.mk | $(mtexdir)
- echo "\\newcommand{\\wfpctwourl}{$(WFPC2URL)}" > $@
+ echo > $@
diff --git a/reproduce/analysis/make/menke2020.mk b/reproduce/analysis/make/menke2020.mk
new file mode 100644
index 0000000..df87080
--- /dev/null
+++ b/reproduce/analysis/make/menke2020.mk
@@ -0,0 +1,65 @@
+# Use the data from Menke 2020 (DOI:10.1101/2020.01.15.908111) as a
+# demonstration analysis for this paper. This is a relevant paper because
+# it provides good statistics about the status of reproducibility in
+# scientific publications.
+#
+# Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
+#
+# This Makefile is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or (at your
+# option) any later version.
+#
+# This Makefile is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details. See <http://www.gnu.org/licenses/>.
+
+
+
+
+# Save the "Table 3" spreadsheet from the downloaded `.xlsx' file into a
+# simple plain-text file that is easy to use. The output has three
+# whitespace-separated columns (year, number of studied papers, number
+# of identified tools); only CSV fields 2, 3 and the last two fields of
+# each row are read, everything else in the row is ignored.
+#
+# NOTE(review): the recipe sets shell variables and changes directory on
+# separate lines, so it assumes all recipe lines run in a single shell
+# (for example through `.ONESHELL' in the including Makefile). Confirm
+# this against the top-level Makefile.
+mk20dir = $(BDIR)/menke2020
+mk20tab3 = $(mk20dir)/table-3.txt
+$(mk20dir):; mkdir $@
+$(mk20tab3): $(indir)/menke-etal-2020.xlsx | $(mk20dir)
+
+	# Set a base-name for the table-3 data.
+	base=$(basename $(notdir $<))-table-3
+
+	# Unfortunately XLSX I/O only works when the input and output are
+	# in the directory it is running. So first, we need to switch to
+	# the input directory, run it, keep a copy of the one sheet we
+	# want (under the base-name above, inside the input directory),
+	# delete the per-sheet CSV files and return to where we started.
+	topdir=$$(pwd)
+	cd $(indir)
+	xlsxio_xlsx2csv $(notdir $<)
+	cp $(notdir $<)."Table 3 All by journal by year".csv $$base.csv
+	rm $(notdir $<).*.csv
+	cd $$topdir
+
+	# Read the necessary information. Note that we are dealing with a
+	# CSV (comma-separated value) file. But when there are commas in a
+	# string, quotation signs are put around it. The `FPAT' value is
+	# fully described in the GNU AWK manual (it is a GNU AWK feature,
+	# so `awk' has to be GNU AWK here). In short, it ensures that if
+	# there is a comma in the middle of double-quotes, it doesn't
+	# count as a delimiter. `NR>1' skips the first line of the CSV
+	# (presumably the column titles). The third output column is the
+	# product of the last two CSV fields.
+	#
+	# The columns are separated by an explicit space in the format
+	# string: the padding of `%-6d' and `%-5d' alone disappears as
+	# soon as a value fills the whole width (a five-digit number of
+	# papers is allowed by the `i16' type declared below), and two
+	# columns would then be merged into one number.
+	echo "# Column 1: YEAR [counter, i16] Year of journal's publication." > $@.tmp
+	echo "# Column 2: NUM_PAPERS [counter, i16] Number of studied papers in that journal." >> $@.tmp
+	echo "# Column 3: NUM_ID_TOOLS [counter, i16] Number of software/tools that were identified." >> $@.tmp
+	awk 'NR>1{printf("%-6d %-5d %d\n", $$2, $$3, $$(NF-1)*$$NF)}' \
+	    FPAT='([^,]+)|("[^"]+")' $(indir)/$$base.csv >> $@.tmp
+
+	# Set the temporary file as the final target. This was done so if
+	# there is any possible crash in the steps above, this rule is
+	# re-run (its final target isn't rebuilt).
+	mv $@.tmp $@
+
+
+
+
+
+# Main LaTeX macro file of this Makefile. No macros are written into it
+# yet: the recipe only touches the file, which happens after the table-3
+# text file has been built (it is the normal prerequisite). The macro
+# directory is an order-only prerequisite, so it only has to exist.
+$(mtexdir)/menke2020.tex: $(mk20tab3) | $(mtexdir)
+ touch $@
diff --git a/reproduce/analysis/make/top-make.mk b/reproduce/analysis/make/top-make.mk
index acbbafa..29bcd83 100644
--- a/reproduce/analysis/make/top-make.mk
+++ b/reproduce/analysis/make/top-make.mk
@@ -113,6 +113,7 @@ endif
makesrc = initialize \
download \
verify \
+ menke2020 \
paper