diff options
author | Mohammad Akhlaghi <mohammad@akhlaghi.org> | 2020-02-16 03:31:26 +0000 |
---|---|---|
committer | Mohammad Akhlaghi <mohammad@akhlaghi.org> | 2020-02-16 03:31:26 +0000 |
commit | 13cb65a2eab708b0bd4777601331b3e83e96beac (patch) | |
tree | 60c7c846592e698789522cf7d80c18cf404473e1 | |
parent | 241515c3dc6ef7f7ee6b0db22312e316196ecb89 (diff) |
Menke+2020 data is now imported and ready for later steps in plain text
The main problem with this dataset was the names of the journals (which
sometimes contain single quotes or apostrophes, which is really annoying
for SED)! But ultimately, for the simple study we want to do here, the
journal names are irrelevant, so in the end I just ignored the names. Later
we can set an identifier for the journals if necessary.
But now we have the basic information in a way that is usable in a plot to
show in this paper.
-rw-r--r-- | reproduce/analysis/config/INPUTS.mk | 8 | ||||
-rw-r--r-- | reproduce/analysis/make/download.mk | 12 | ||||
-rw-r--r-- | reproduce/analysis/make/menke2020.mk | 65 | ||||
-rw-r--r-- | reproduce/analysis/make/top-make.mk | 1 | ||||
-rwxr-xr-x | reproduce/software/bash/configure.sh | 15 | ||||
-rw-r--r-- | tex/src/references.tex | 2 |
6 files changed, 84 insertions, 19 deletions
diff --git a/reproduce/analysis/config/INPUTS.mk b/reproduce/analysis/config/INPUTS.mk index 6ddaec7..9332df3 100644 --- a/reproduce/analysis/config/INPUTS.mk +++ b/reproduce/analysis/config/INPUTS.mk @@ -9,7 +9,7 @@ # this notice are preserved. This file is offered as-is, without any # warranty. -WFPC2IMAGE = WFPC2ASSNu5780205bx.fits -WFPC2MD5 = a4791e42cd1045892f9c41f11b50bad8 -WFPC2SIZE = 62kb -WFPC2URL = https://fits.gsfc.nasa.gov/samples +MK20DATA = menke-etal-2020.xlsx +MK20MD5 = 8e4eee64791f351fec58680126d558a0 +MK20SIZE = 1.9MB +MK20URL = https://www.biorxiv.org/content/biorxiv/early/2020/01/18/2020.01.15.908111/DC1/embed/media-1.xlsx diff --git a/reproduce/analysis/make/download.mk b/reproduce/analysis/make/download.mk index 7c290f4..7e61cb8 100644 --- a/reproduce/analysis/make/download.mk +++ b/reproduce/analysis/make/download.mk @@ -49,12 +49,12 @@ # progress at every moment. $(indir):; mkdir $@ downloadwrapper = $(bashdir)/download-multi-try -inputdatasets = $(foreach i, wfpc2, $(indir)/$(i).fits) -$(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir) +inputdatasets = $(indir)/menke-etal-2020.xlsx +$(inputdatasets): $(indir)/%: | $(indir) $(lockdir) # Set the necessary parameters for this input file. - if [ $* = wfpc2 ]; then - origname=$(WFPC2IMAGE); url=$(WFPC2URL); mdf=$(WFPC2MD5); + if [ $* = menke-etal-2020.xlsx ]; then + origname=$(MK20DATA); fullurl=$(MK20URL); mdf=$(MK20MD5); else echo; echo; echo "Not recognized input dataset: '$*.fits'." echo; echo; exit 1 @@ -72,7 +72,7 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir) else touch $(lockdir)/download $(downloadwrapper) "wget --no-use-server-timestamps -O" \ - $(lockdir)/download $$url/$$origname $@ + $(lockdir)/download $$fullurl $@ fi # Check the md5 sum to see if this is the proper dataset. @@ -94,4 +94,4 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir) # It is very important to mention the address where the data were # downloaded in the final report. 
$(mtexdir)/download.tex: $(pconfdir)/INPUTS.mk | $(mtexdir) - echo "\\newcommand{\\wfpctwourl}{$(WFPC2URL)}" > $@ + echo > $@ diff --git a/reproduce/analysis/make/menke2020.mk b/reproduce/analysis/make/menke2020.mk new file mode 100644 index 0000000..df87080 --- /dev/null +++ b/reproduce/analysis/make/menke2020.mk @@ -0,0 +1,65 @@ +# Use the data from Menke 2020 (DOI:10.1101/2020.01.15.908111) as a +# demonstration analysis for this paper. This is a relevant paper because +# it provides good statistics about the status of reproducibility in +# scientific publications. +# +# Copyright (C) 2020 Mohammad Akhlaghi <mohammad@akhlaghi.org> +# +# This Makefile is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This Makefile is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. See <http://www.gnu.org/licenses/>. + + + + +# Save the "Table 3" spreadsheet from the downloaded `.xlsx' file into a +# simple plain-text file that is easy to use. +mk20dir = $(BDIR)/menke2020 +mk20tab3 = $(mk20dir)/table-3.txt +$(mk20dir):; mkdir $@ +$(mk20tab3): $(indir)/menke-etal-2020.xlsx | $(mk20dir) + + # Set a base-name for the table-3 data. + base=$(basename $(notdir $<))-table-3 + + # Unfortunately XLSX I/O only works when the input and output are + # in the directory it is running. So first, we need to switch to + # the input directory, run it, then put our desired output where we + # want and delete the extra files. + topdir=$$(pwd) + cd $(indir) + xlsxio_xlsx2csv $(notdir $<) + cp $(notdir $<)."Table 3 All by journal by year".csv $$base.csv + rm $(notdir $<).*.csv + cd $$topdir + + # Read the necessary information. 
Note that we are dealing with a + # CSV (comma-separated value) file. But when there are commas in a + # string, quotation signs are put around it. The `FPAT' values is + # fully described in the GNU AWK manual. In short, it ensures that + # if there is a comma in the middle of double-quotes, it doesn't + # count as a delimter. + echo "# Column 1: YEAR [counter, i16] Year of journal's publication." > $@.tmp + echo "# Column 2: NUM_PAPERS [counter, i16] Number of studied papers in that journal." >> $@.tmp + echo "# Column 3: NUM_ID_TOOLS [counter, i16] Number of software/tools that were identified." >> $@.tmp + awk 'NR>1{printf("%-6d%-5d%d\n", $$2, $$3, $$(NF-1)*$$NF)}' \ + FPAT='([^,]+)|("[^"]+")' $(indir)/$$base.csv >> $@.tmp + + # Set the temporary file as the final target. This was done so if + # there is any possible crash in the steps above, this rule is + # re-run (its final target isn't rebuilt). + mv $@.tmp $@ + + + + + +# Main LaTeX macro file +$(mtexdir)/menke2020.tex: $(mk20tab3) | $(mtexdir) + touch $@ diff --git a/reproduce/analysis/make/top-make.mk b/reproduce/analysis/make/top-make.mk index acbbafa..29bcd83 100644 --- a/reproduce/analysis/make/top-make.mk +++ b/reproduce/analysis/make/top-make.mk @@ -113,6 +113,7 @@ endif makesrc = initialize \ download \ verify \ + menke2020 \ paper diff --git a/reproduce/software/bash/configure.sh b/reproduce/software/bash/configure.sh index 08f2609..f7fa9c8 100755 --- a/reproduce/software/bash/configure.sh +++ b/reproduce/software/bash/configure.sh @@ -299,10 +299,10 @@ if [ x"$input_dir" = x ]; then else indir=$input_dir fi -wfpc2name=$(awk '!/^#/ && $1=="WFPC2IMAGE" {print $3}' $adir/INPUTS.mk) -wfpc2md5=$(awk '!/^#/ && $1=="WFPC2MD5" {print $3}' $adir/INPUTS.mk) -wfpc2size=$(awk '!/^#/ && $1=="WFPC2SIZE" {print $3}' $adir/INPUTS.mk) -wfpc2url=$(awk '!/^#/ && $1=="WFPC2URL" {print $3}' $adir/INPUTS.mk) +mk20name=$(awk '!/^#/ && $1=="MK20DATA" {print $3}' $adir/INPUTS.mk) +mk20md5=$(awk '!/^#/ && $1=="MK20MD5" 
{print $3}' $adir/INPUTS.mk) +mk20size=$(awk '!/^#/ && $1=="MK20SIZE" {print $3}' $adir/INPUTS.mk) +mk20url=$(awk '!/^#/ && $1=="MK20URL" {print $3}' $adir/INPUTS.mk) if [ $rewritepconfig = yes ] && [ x"$input_dir" = x ]; then cat <<EOF @@ -315,10 +315,9 @@ please specify the directory hosting them on this system. If you don't, they will be downloaded automatically. Each file is shown with its total volume and its 128-bit MD5 checksum in parenthesis. - $wfpc2name ($wfpc2size, $wfpc2md5): - A 100x100 Hubble Space Telescope WFPC II image used in the FITS - standard webpage as a demonstration of this file format. - URL: $wfpc2url/$wfpc2name + $mk20name ($mk20size, $mk20md5): + Supplementary dataset to http://dx.doi.org/10.1101/2020.01.15.908111 + URL: $mk20url NOTE I: This directory, or the datasets above, are optional. If it doesn't exist, the files will be downloaded in the build directory and used. diff --git a/tex/src/references.tex b/tex/src/references.tex index 63ea0b2..2a67584 100644 --- a/tex/src/references.tex +++ b/tex/src/references.tex @@ -22,7 +22,7 @@ month = "Feb", volume = {491}, number = {4}, - pages = {5317-5329}, + pages = {5317}, doi = {10.1093/mnras/stz3111}, archivePrefix = {arXiv}, eprint = {1911.01430}, |