aboutsummaryrefslogtreecommitdiff
path: root/reproduce/analysis
diff options
context:
space:
mode:
Diffstat (limited to 'reproduce/analysis')
-rw-r--r--reproduce/analysis/config/INPUTS.conf115
-rw-r--r--reproduce/analysis/make/initialize.mk125
2 files changed, 172 insertions, 68 deletions
diff --git a/reproduce/analysis/config/INPUTS.conf b/reproduce/analysis/config/INPUTS.conf
index 75e24de..5970ae3 100644
--- a/reproduce/analysis/config/INPUTS.conf
+++ b/reproduce/analysis/config/INPUTS.conf
@@ -1,9 +1,10 @@
# This project's input file information (metadata).
#
# For each input (external) data file that is used within the project,
-# three variables are suggested here (two of them are mandatory). These
-# variables will be used by 'reproduce/analysis/make/download.mk' to import
-# the dataset into the project (within the build directory):
+# three variables are suggested here (only the verification variable is
+# strictly mandatory). These variables will be used by the download rule of
+# 'reproduce/analysis/make/initialize.mk' to import the dataset into the
+# project (within the build directory):
#
# - If the file already exists locally in '$(INDIR)' (the optional input
# directory that may have been specified at configuration time with
@@ -12,27 +13,53 @@
# files are large.
#
# - If the file doesn't exist in '$(INDIR)', or no input directory was
-# specified at configuration time, then the file is downloaded from a
-# specific URL.
+# specified at configuration time, then the file is downloaded from the
+# specified URL for that dataset.
#
# In both cases, before placing the file (or its link) in the build
-# directory, 'reproduce/analysis/make/download.mk' will check the SHA256
-# checksum of the dataset and if it differs from the pre-defined value (set
-# for that file, here), it will abort (since this is not the intended
-# dataset).
-#
-# Therefore, the two variables specifying the URL and SHA256 checksum of
-# the file are MANDATORY. The third variable (INPUT-%-size) showing the
-# human-readable size of the file (from 'ls -lh') is optional (but
-# recommended: because it gives future scientists to get a feeling of the
-# volume of data they need to input: will become important if the
-# size/number of files is large).
+# directory, the download rule of 'reproduce/analysis/make/initialize.mk'
+# will calculate the checksum of the dataset and if it differs from the
+# pre-defined value (set for that file, here), it will abort (since this is
+# not the intended dataset).
#
+# Verification (two modes)
+# ------------------------
+# - SHA256 checksum. This will check the full contents of the file, and
+# is generic to any data format. However, if the server inserts custom
+# headers like the query date or query code, etc., this form of
+# validation is not useful: because every download will have different
+# headers. In such cases, you should use the other verification method
+# below. In other words, this method is only good for files that are
+# "static" on the server (and left there unchanged). If the file is
+# generated at request time, the server usually inserts custom run-time
+# dependent headers; making it impossible to verify with an SHA
+# checksum of the whole file.
+# - The FITS Standard's 'DATASUM' (which will only check the data, not
+# the headers). According to the FITS standard, this sum ignores all
+# headers, and is only calculated on a HDU's data. By default, this
+# will require Gnuastro (which can easily calculate and return the
+# value on the command-line), and it assumes HDU number 1 (counting
+# from 0). You can modify the defaults by modifying the rule in
+# 'reproduce/analysis/make/initialize.mk'.
+#
+# Automatic writing of verification
+# ---------------------------------
+# In case you would like Maneage to find the checksum upon downloading, put
+# the string '--auto-replace--' instead of a checksum. This can be helpful
+# for large datasets; where downloading only for adding the checksum is not
+# easy/possible and can be buggy. In this scenario, upon downloading the
+# file its checksum will be calculated and will be replaced with the
+# '--auto-replace--' in this file. But since this file is under version
+# control, be sure to commit all the updated checksums after your downloads
+# are finished!
+#
+# Variable description
+# --------------------
# The naming convension is critical for the input files to be properly
-# imported into the project. In the patterns below, the '%' is the full
-# file name (including its suffix): for example in the demo input of this
-# file in the 'maneage' branch, we have 'INPUT-wfpc2.fits-sha256':
-# therefore, the input file (within the project's '$(indir)') is called
+# imported into Maneage. In the patterns below, the '%' is the full file
+# name (including its suffix): for example in the demo input of this file
+# in the 'maneage' branch, we have 'INPUT-wfpc2.fits-sha256': therefore,
+# the input file (within the project's '$(indir)') is called
# 'wfpc2.fits'. This allows you to simply set '$(indir)/wfpc2.fits' as the
# pre-requisite of any recipe that needs the input file: you will rarely
# (if at all!) need to use these variables directly.
@@ -40,23 +67,17 @@
# INPUT-%-sha256: The sha256 checksum of the file. You can generate the
# SHA256 checksum of a file with the 'sha256sum FILENAME'
# command (where 'FILENAME' is the name of your
-# file). this is very important for an automatic
-# verification of the file: that it hasn't changed
-# between different runs of the project (locally or in
-# the URL). There are more robust checksum algorithms
-# like the 'SHA' standards.
-#
-# AUTOMATIC CHEKSUM CALCULATION: In case you would like
-# Maneage to find the checksum upon downloading, put the
-# string '--auto-replace--' instead of a checksum. This
-# can be helpful for large datasets; where downloading
-# only for adding the checksum is not easy/possible and
-# can be buggy. In this scenario, upon downloading the
-# file its checksum will be calculated and will be
-# replaced with the '--auto-replace--' in this file. But
-# since this file is under version control, be sure to
-# commit all the updated checksums after your downloads
-# are finished!
+# file). Don't use this if you give the 'fitsdatasum'
+# variable.
+#
+# INPUT-%-fitsdatasum: The FITS standard DATASUM value for HDU number 1
+# of the FITS file (counting from 0). Don't use this
+# if you give the 'sha256' variable.
+#
+# INPUT-%-fitshdu: The HDU identifier (counting from 0, or name) to use
+# for the verification. This is only relevant in the
+# 'fitsdatasum' verification method and optional (if not
+# given, HDU number 1 is used; counting from 0).
#
# INPUT-%-url: The URL to download the file if it is not available
# locally. It can happen that during the first phases of
@@ -70,6 +91,13 @@
# good feeling of the necessary network and storage
# capacity that is necessary to start the project.
#
+# Therefore, the verification variable is MANDATORY in any case. The
+# variable with a URL is only necessary if you do not have the file
+# locally. However, the size variable is optional (but recommended: because
+# it gives future scientists a feeling of the volume of data they need to
+# input to run your project: will become important if the size/number of
+# files is large).
+#
# The input dataset's name (that goes into the '%') can be different from
# the URL's file name (last component of the URL, after the last '/'). Just
# note that it is assumed that the local copy (outside of your project) is
@@ -87,7 +115,18 @@
-# Demo dataset used in the histogram plot (remove when customizing).
+# Demo dataset used in the histogram plot
+# ---------------------------------------
+#
+# Remove this part while you are entering your project's datasets.
+#
+# Since the demonstration dataset is a FITS file, we have also added the
+# two '$(INPUT-%-fits*)' variables as a demonstration. But they are
+# commented because the SHA256 method is also possible for this file (it
+# is not generated on the server at query time; it is a static file on
+# the server).
INPUT-wfpc2.fits-size = 62K
INPUT-wfpc2.fits-url = https://fits.gsfc.nasa.gov/samples/WFPC2ASSNu5780205bx.fits
INPUT-wfpc2.fits-sha256 = 9851bc2bf9a42008ea606ec532d04900b60865daaff2f233e5c8565dac56ad5f
+#INPUT-wfpc2.fits-fitshdu = 0
+#INPUT-wfpc2.fits-fitsdatasum = 2218330266
diff --git a/reproduce/analysis/make/initialize.mk b/reproduce/analysis/make/initialize.mk
index eb5f2ff..7314184 100644
--- a/reproduce/analysis/make/initialize.mk
+++ b/reproduce/analysis/make/initialize.mk
@@ -63,15 +63,18 @@ pconfdir = reproduce/analysis/config
# loaded.
#
# If your project doesn't need any preparation, you can ignore this.
+#
+# The '-' before the include commands is used for adding the files only if
+# it is possible (they exist). This is necessary because sometimes the user
+# will have only '*.conf' or '*.mk' files, or will run 'make clean' (where
+# the preparation Makefile may call initialize.mk before the main
+# 'top-make.mk'). If the '-' is not used, Make will complain about not
+# finding these files.
ifeq (x$(project-phase),xprepare)
$(prepdir):; mkdir $@
else
-include $(bsdir)/preparation-done.mk
+-include $(bsdir)/preparation-done.mk
ifeq (x$(include-prepare-results),xyes)
-# The '-' behind the include is The '-' is used for adding the files only
-# if it is possible (they exist). This is necessary because sometimes the
-# user will have only '*.conf' or '*.mk' files. So, if the '-' is not used,
-# Make will complain about not finding these files.
-include $(prepdir)/*.mk $(prepdir)/*.conf
endif
endif
@@ -227,7 +230,7 @@ project-commit-hash := $(shell \
export LD_LIBRARY_PATH="$(installdir)/lib"; \
echo $$($(installdir)/bin/git describe --dirty --always --long); \
else echo NOGIT; fi)
-project-package-name := maneaged-$(project-commit-hash)
+project-package-name = maneaged-$(project-commit-hash)
project-package-contents = $(texdir)/$(project-package-name)
@@ -438,13 +441,15 @@ dist-software:
-# Download input data
-# --------------------
+# Import input data
+# -----------------
#
-# 'reproduce/analysis/config/INPUTS.conf' contains the input dataset
-# properties. In most cases, you will not need to edit this rule. Simply
-# follow the instructions of 'INPUTS.conf' and set the variables names
-# according to the described standards and everything should be fine.
+# The files to be imported (downloaded from a server, or linked from a
+# local location) are listed in 'reproduce/analysis/config/INPUTS.conf'
+# along with their URLs and verification checksums. In most cases, you will
+# not need to edit this rule. Simply follow the instructions at the top of
+# 'INPUTS.conf' and set the variable names according to the described
+# standards and everything should be fine.
#
# TECHNICAL NOTE on the '$(foreach, n ...)' loop of 'inputdatasets': we are
# using several (relatively complex!) features particular to Make: In GNU
@@ -465,16 +470,60 @@ dist-software:
# progress at every moment.
$(indir):; mkdir $@
downloadwrapper = $(bashdir)/download-multi-try
-inputdatasets = $(foreach i, \
- $(patsubst INPUT-%-sha256,%, \
- $(filter INPUT-%-sha256,$(.VARIABLES))), \
- $(indir)/$(i))
+inputdatasets := $(foreach i, \
+ $(patsubst INPUT-%-sha256,%, \
+ $(filter INPUT-%-sha256,$(.VARIABLES))) \
+ $(patsubst INPUT-%-fitsdatasum,%, \
+ $(filter INPUT-%-fitsdatasum,$(.VARIABLES))), \
+ $(indir)/$(i))
$(inputdatasets): $(indir)/%: | $(indir) $(lockdir)
-# Set the necessary parameters for this input file as shell variables
-# (to help in readability).
- url=$(INPUT-$*-url)
- sha=$(INPUT-$*-sha256)
+# Starting rule with '@': In case there is a username or password
+# given for the database, we don't want the values to be printed in
+# the terminal as the pipeline is running. We are therefore starting
+# this recipe with an '@' (so Make doesn't print the used
+# commands). To help the user know what is happening (in case they
+# can't tell from the Wget outputs), we'll just start the recipe with
+# a notice on what is being imported.
+ @echo "Importing $@"
+
+# If a username or password has been provided, add them to the WGET
+# command. The two variables are defined in the local configuration
+# file 'reproduce/software/config/LOCAL.conf' that is not under
+# version control. Different servers may use different authentication
+# formats. If the default one doesn't work for your server, comment
+# it and uncomment the one that works. If your server needs a
+# different kind of authentication format, please add it yourself. In
+# case you need a new format, we encourage you to send the format to
+# us using the link below:
+# https://savannah.nongnu.org/support/?group=reproduce&func=additem
+ authopt=""
+ if [ x"$(DATABASEAUTHTYPE)" != x ]; then
+ case "$(DATABASEAUTHTYPE)" in
+
+# Format: '--user=XXXX --password=YYYY'
+ userpass)
+ if [ x'$(DATABASEUSER)' != x ]; then
+ authopt="--user='$(DATABASEUSER)'"; fi
+ if [ x'$(DATABASEPASS)' != x ]; then
+ authopt="$$authopt --password='$(DATABASEPASS)'"; fi
+ ;;
+
+# Format: --post-data 'username=XXXX&password=YYYY'
+ postdata)
+ if [ x'$(DATABASEUSER)' != x ]; then
+ authopt="--post-data 'username=$(DATABASEUSER)"; fi
+ if [ x'$(DATABASEPASS)' != x ]; then
+ authopt="$$authopt""&password=$(DATABASEPASS)'";
+ else authopt="$$authopt'" # To close the single quote
+ fi
+ ;;
+
+# Unrecognized format.
+ *)
+ echo "Maneage: 'DATABASEAUTHTYPE' format not recognized! Please see the description of this variable in 'reproduce/software/config/LOCAL.conf' for the acceptable values."; exit 1;;
+ esac
+ fi
# Download (or make the link to) the input dataset. If the file
# exists in 'INDIR', it may be a symbolic link to some other place in
@@ -488,13 +537,29 @@ $(inputdatasets): $(indir)/%: | $(indir) $(lockdir)
ln -fs $$(readlink -f $(INDIR)/$*) $$unchecked
else
touch $(lockdir)/download
- $(downloadwrapper) "wget --no-use-server-timestamps -O" \
- $(lockdir)/download $$url $$unchecked
+ $(downloadwrapper) "wget $$authopt --no-use-server-timestamps -O" \
+ $(lockdir)/download $(INPUT-$*-url) $$unchecked
+ fi
+
+# Set the verification variables (SHA256 is used if both are given).
+ if [ x"$(INPUT-$*-sha256)" != x ]; then
+ suffix=sha256
+ sumin=$(INPUT-$*-sha256)
+ verifname="SHA256 checksum"
+ sum=$$(sha256sum $$unchecked | awk '{print $$1}')
+ elif [ x"$(INPUT-$*-fitsdatasum)" != x ]; then
+ suffix=fitsdatasum
+ sumin=$(INPUT-$*-fitsdatasum)
+ verifname="FITS standard DATASUM"
+ if [ x"$(INPUT-$*-fitshdu)" = x ]; then hdu=1;
+ else hdu="$(INPUT-$*-fitshdu)"; fi
+ sum=$$(astfits $$unchecked -h$$hdu --datasum | awk '{print $$1}')
+ else
+ echo "$@: checksum for verification not recognized!"; exit 1
fi
-# Check the checksum to see if this is the proper dataset.
- sum=$$(sha256sum $$unchecked | awk '{print $$1}')
- if [ $$sum = $$sha ]; then
+# Verify the input.
+ if [ $$sum = $$sumin ]; then
mv $$unchecked $@
echo "Integrity confirmed, using $@ in this project."
@@ -502,11 +567,11 @@ $(inputdatasets): $(indir)/%: | $(indir) $(lockdir)
else
# The user has asked to update the checksum in 'INPUTS.conf'.
- if [ $$sha = "--auto-replace--" ]; then
+ if [ $$sumin = "--auto-replace--" ]; then
# Put the updated 'INPUTS.conf' in a temporary file.
inputstmp=$@.inputs
- awk '{if($$1 == "INPUT-$*-sha256") \
+ awk '{if($$1 == "INPUT-$*-'$$suffix'") \
$$3="'$$sum'"; print}' \
$(pconfdir)/INPUTS.conf > $$inputstmp
@@ -520,10 +585,10 @@ $(inputdatasets): $(indir)/%: | $(indir) $(lockdir)
# Error on non-matching checksums.
else
echo; echo;
- echo "Wrong SHA256 checksum for input file '$*':"
+ echo "Wrong $$verifname for input file '$*':"
echo " File location: $$unchecked"; \
- echo " Expected SHA256 checksum: $$sha"; \
- echo " Calculated SHA256 checksum: $$sum"; \
+ echo " Expected $$verifname: $$sumin"; \
+ echo " Calculated $$verifname: $$sum"; \
echo; exit 1
fi
fi