diff options
Diffstat (limited to 'reproduce/analysis')
-rw-r--r-- | reproduce/analysis/config/INPUTS.conf | 115 | ||||
-rw-r--r-- | reproduce/analysis/make/initialize.mk | 125 |
2 files changed, 172 insertions, 68 deletions
diff --git a/reproduce/analysis/config/INPUTS.conf b/reproduce/analysis/config/INPUTS.conf index 75e24de..5970ae3 100644 --- a/reproduce/analysis/config/INPUTS.conf +++ b/reproduce/analysis/config/INPUTS.conf @@ -1,9 +1,10 @@ # This project's input file information (metadata). # # For each input (external) data file that is used within the project, -# three variables are suggested here (two of them are mandatory). These -# variables will be used by 'reproduce/analysis/make/download.mk' to import -# the dataset into the project (within the build directory): +# three variables are suggested here (only the verification variable is +# strictly mandatory). These variables will be used by the download rule of +# 'reproduce/analysis/make/initialize.mk' to import the dataset into the +# project (within the build directory): # # - If the file already exists locally in '$(INDIR)' (the optional input # directory that may have been specified at configuration time with @@ -12,27 +13,53 @@ # files are large. # # - If the file doesn't exist in '$(INDIR)', or no input directory was -# specified at configuration time, then the file is downloaded from a -# specific URL. +# specified at configuration time, then the file is downloaded from the +# specified URL for that dataset. # # In both cases, before placing the file (or its link) in the build -# directory, 'reproduce/analysis/make/download.mk' will check the SHA256 -# checksum of the dataset and if it differs from the pre-defined value (set -# for that file, here), it will abort (since this is not the intended -# dataset). -# -# Therefore, the two variables specifying the URL and SHA256 checksum of -# the file are MANDATORY. The third variable (INPUT-%-size) showing the -# human-readable size of the file (from 'ls -lh') is optional (but -# recommended: because it gives future scientists to get a feeling of the -# volume of data they need to input: will become important if the -# size/number of files is large). +# directory, the download rule of 'reproduce/analysis/make/initialize.mk' +# will check the verification of the dataset and if it differs from the +# pre-defined value (set for that file, here), it will abort (since this is +# not the intended dataset). # +# Verification (two modes) +# ------------------------ +# - SHA256 checksum. This will check the full contents of the file, and +# is generic to any data format. However, if the server inserts custom +# headers like the query date or query code and etc, this form of +# validation is not useful: because every download will have different +# headers. In such cases, you should use the other verification methods +# below. In other words, this method is only good for files that are +# "static" on the server (and left there unchanged). If the file is +# generated at request time, the server usually inserts custom run-time +# dependent headers; making it impossible to verify with an SHA +# checksum of the whole file. +# - The FITS Standard's 'DATASUM' (which will only check the data, not +# the headers). According to the FITS standard, this sum ignores all +# headers, and is only calculated on a HDU's data. By default, this +# will require Gnuastro (which can easily calculate and return the +# value on the command-line), and it assumes HDU number 1 (counting +# from 0). You can modify the defaults by modifying the rule in +# 'reproduce/analysis/make/initialize.mk'. +# +# Automatic writing of verification +# --------------------------------- +# In case you would like Maneage to find the checksum upon downloading, put +# the string '--auto-replace--' instead of a checksum. This can be helpful +# for large datasets; where downloading only for adding the checksum is not +# easy/possible and can be buggy. In this scenario, upon downloading the +# file its checksum will be calculated and will be replaced with the +# '--auto-replace--' in this file. But since this file is under version +# control, be sure to commit all the updated checksums after your downloads +# are finished! +# +# Variable description +# -------------------- # The naming convension is critical for the input files to be properly -# imported into the project. In the patterns below, the '%' is the full -# file name (including its suffix): for example in the demo input of this -# file in the 'maneage' branch, we have 'INPUT-wfpc2.fits-sha256': -# therefore, the input file (within the project's '$(indir)') is called +# imported into Maneage. In the patterns below, the '%' is the full file +# name (including its suffix): for example in the demo input of this file +# in the 'maneage' branch, we have 'INPUT-wfpc2.fits-sha256': therefore, +# the input file (within the project's '$(indir)') is called # 'wfpc2.fits'. This allows you to simply set '$(indir)/wfpc2.fits' as the # pre-requisite of any recipe that needs the input file: you will rarely # (if at all!) need to use these variables directly. @@ -40,23 +67,17 @@ # INPUT-%-sha256: The sha256 checksum of the file. You can generate the # SHA256 checksum of a file with the 'sha256sum FILENAME' # command (where 'FILENAME' is the name of your -# file). this is very important for an automatic -# verification of the file: that it hasn't changed -# between different runs of the project (locally or in -# the URL). There are more robust checksum algorithms -# like the 'SHA' standards. -# -# AUTOMATIC CHEKSUM CALCULATION: In case you would like -# Maneage to find the checksum upon downloading, put the -# string '--auto-replace--' instead of a checksum. This -# can be helpful for large datasets; where downloading -# only for adding the checksum is not easy/possible and -# can be buggy. In this scenario, upon downloading the -# file its checksum will be calculated and will be -# replaced with the '--auto-replace--' in this file. But -# since this file is under version control, be sure to -# commit all the updated checksums after your downloads -# are finished! +# file). Don't use this if you give the 'fitsdatasum' +# keyvalue. +# +# INPUT-%-fitsdatasum: The FITS standard DATASUM value for HDU number 1 +# of the FITS file (counting from 0). Don't use this +# if you give the 'sha256' keyword. +# +# INPUT-%-fitshdu: The HDU identifier (counter from 0, or name) to use +# for the verification. This is only relevant in the +# 'fitsdatasum' verification method and optional (if not +# given, HDU number 1 is used; counting from 0). # # INPUT-%-url: The URL to download the file if it is not available # locally. It can happen that during the first phases of @@ -70,6 +91,13 @@ # good feeling of the necessary network and storage # capacity that is necessary to start the project. # +# Therefore, the the verification variable is MANDATORY in any case. The +# variable with a URL is only necessary if you do not have the file +# locally. However, The size variable is optional (but recommended: because +# it gives future scientists a feeling of the volume of data they need to +# input to run your project: will become important if the size/number of +# files is large). +# # The input dataset's name (that goes into the '%') can be different from # the URL's file name (last component of the URL, after the last '/'). Just # note that it is assumed that the local copy (outside of your project) is @@ -87,7 +115,18 @@ -# Demo dataset used in the histogram plot (remove when customizing). +# Demo dataset used in the histogram plot +# --------------------------------------- +# +# Remove this part while you are entering your project's datasets. +# +# Since the demonstration dataset is a FITS file, we have also added the +# two '$(INPUT-%-fits*)' variables as a demonstration. But they are +# commented because the SHA256 method is also possible for this file (its +# not generated on the server at query time; it is a static file on the +# server). INPUT-wfpc2.fits-size = 62K INPUT-wfpc2.fits-url = https://fits.gsfc.nasa.gov/samples/WFPC2ASSNu5780205bx.fits INPUT-wfpc2.fits-sha256 = 9851bc2bf9a42008ea606ec532d04900b60865daaff2f233e5c8565dac56ad5f +#INPUT-wfpc2.fits-fitshdu = 0 +#INPUT-wfpc2.fits-fitsdatasum = 2218330266 diff --git a/reproduce/analysis/make/initialize.mk b/reproduce/analysis/make/initialize.mk index eb5f2ff..7314184 100644 --- a/reproduce/analysis/make/initialize.mk +++ b/reproduce/analysis/make/initialize.mk @@ -63,15 +63,18 @@ pconfdir = reproduce/analysis/config # loaded. # # If your project doesn't need any preparation, you can ignore this. +# +# The '-' behind the include commands is used for adding the files only if +# it is possible (they exist). This is necessary because sometimes the user +# will have only '*.conf' or '*.mk' files, or with 'make clean' (where the +# preparation Makefile may call initialize.mk before the main +# 'top-make.mk'). If the '-' is not used, Make will complain about not +# finding these files. ifeq (x$(project-phase),xprepare) $(prepdir):; mkdir $@ else -include $(bsdir)/preparation-done.mk +-include $(bsdir)/preparation-done.mk ifeq (x$(include-prepare-results),xyes) -# The '-' behind the include is The '-' is used for adding the files only -# if it is possible (they exist). This is necessary because sometimes the -# user will have only '*.conf' or '*.mk' files. So, if the '-' is not used, -# Make will complain about not finding these files. -include $(prepdir)/*.mk $(prepdir)/*.conf endif endif @@ -227,7 +230,7 @@ project-commit-hash := $(shell \ export LD_LIBRARY_PATH="$(installdir)/lib"; \ echo $$($(installdir)/bin/git describe --dirty --always --long); \ else echo NOGIT; fi) -project-package-name := maneaged-$(project-commit-hash) +project-package-name = maneaged-$(project-commit-hash) project-package-contents = $(texdir)/$(project-package-name) @@ -438,13 +441,15 @@ dist-software: -# Download input data -# -------------------- +# Import input data +# ----------------- # -# 'reproduce/analysis/config/INPUTS.conf' contains the input dataset -# properties. In most cases, you will not need to edit this rule. Simply -# follow the instructions of 'INPUTS.conf' and set the variables names -# according to the described standards and everything should be fine. +# The list files to be imported (downloaded from a server, or linked from a +# local location), are listed in 'reproduce/analysis/config/INPUTS.conf' +# along with their URLs and verification checksums. In most cases, you will +# not need to edit this rule. Simply follow the instructions at the top of +# 'INPUTS.conf' and set the variables names according to the described +# standards and everything should be fine. # # TECHNICAL NOTE on the '$(foreach, n ...)' loop of 'inputdatasets': we are # using several (relatively complex!) features particular to Make: In GNU @@ -465,16 +470,60 @@ dist-software: # progress at every moment. $(indir):; mkdir $@ downloadwrapper = $(bashdir)/download-multi-try -inputdatasets = $(foreach i, \ - $(patsubst INPUT-%-sha256,%, \ - $(filter INPUT-%-sha256,$(.VARIABLES))), \ - $(indir)/$(i)) +inputdatasets := $(foreach i, \ + $(patsubst INPUT-%-sha256,%, \ + $(filter INPUT-%-sha256,$(.VARIABLES))) \ + $(patsubst INPUT-%-fitsdatasum,%, \ + $(filter INPUT-%-fitsdatasum,$(.VARIABLES))), \ + $(indir)/$(i)) $(inputdatasets): $(indir)/%: | $(indir) $(lockdir) -# Set the necessary parameters for this input file as shell variables -# (to help in readability). - url=$(INPUT-$*-url) - sha=$(INPUT-$*-sha256) +# Starting rule with '@': In case there is a username or password +# given for the database, we don't want the values to be printed in +# the terminal as the pipeline is running. We are therefore starting +# this recipe with an '@' (so Make doesn't print the used +# commands). To help the user know what is happening (in case they +# can't tell from the Wget outputs), we'll just start the recipe with +# a notice on what is being imported. + @echo "Importing $@" + +# If a username or password has been provided, add them to the WGET +# command. The two variables are defined in the local configuation +# file 'reproduce/software/config/LOCAL.conf' that is not under +# version control. Different servers may use different authentication +# formats. If the default one doesn't work for your server, comment +# it and uncomment the one that works. If your serve needs a +# different kind of authentication format, please add it yourself. In +# case you need a new format, we encourage you to send the format to +# us using the link below: +# https://savannah.nongnu.org/support/?group=reproduce&func=additem + authopt="" + if [ x"$(DATABASEAUTHTYPE)" != x ]; then + case "$(DATABASEAUTHTYPE)" in + +# Format: '--user=XXXX --password=YYYY' + userpass) + if [ x'$(DATABASEUSER)' != x ]; then + authopt="--user='$(DATABASEUSER)'"; fi + if [ x'$(DATABASEPASS)' != x ]; then + authopt="$$authopt --password='$(DATABASEPASS)'"; fi + ;; + +# Format: --post-data 'username=XXXX&password=YYYY' + postdata) + if [ x'$(DATABASEUSER)' != x ]; then + authopt="--post-data 'username=$(DATABASEUSER)"; fi + if [ x'$(DATABASEPASS)' != x ]; then + authopt="$$authopt""&password=$(DATABASEPASS)'"; + else authopt="$$authopt'" # To close the single quote + fi + ;; + +# Unrecognized format. + *) + echo "Maneage: 'DATABASEAUTHTYPE' format not recognized! Please see the description of this variable in 'reproduce/software/config/LOCAL.conf' for the acceptable values."; exit 1;; + esac + fi # Download (or make the link to) the input dataset. If the file # exists in 'INDIR', it may be a symbolic link to some other place in @@ -488,13 +537,29 @@ $(inputdatasets): $(indir)/%: | $(indir) $(lockdir) ln -fs $$(readlink -f $(INDIR)/$*) $$unchecked else touch $(lockdir)/download - $(downloadwrapper) "wget --no-use-server-timestamps -O" \ - $(lockdir)/download $$url $$unchecked + $(downloadwrapper) "wget $$authopt --no-use-server-timestamps -O" \ + $(lockdir)/download $(INPUT-$*-url) $$unchecked + fi + +# Set the checksum related variables. + if [ x"$(INPUT-$*-sha256)" != x ]; then + suffix=sha256 + sumin=$(INPUT-$*-sha256) + verifname="SHA256 checksum" + sum=$$(sha256sum $$unchecked | awk '{print $$1}') + elif [ x"$(INPUT-$*-fitsdatasum)" != x ]; then + suffix=fitsdatasum + sumin=$(INPUT-$*-fitsdatasum) + verifname="FITS standard DATASUM" + if [ x"$(INPUT-$*-fitshdu)" = x ]; then hdu=1; + else hdu="$(INPUT-$*-fitshdu)"; fi + sum=$$(astfits $$unchecked -h$$hdu --datasum | awk '{print $$1}') + else + echo "$@: checksum for verifyication not recognized!"; exit 1 fi -# Check the checksum to see if this is the proper dataset. - sum=$$(sha256sum $$unchecked | awk '{print $$1}') - if [ $$sum = $$sha ]; then +# Verify the input. + if [ $$sum = $$sumin ]; then mv $$unchecked $@ echo "Integrity confirmed, using $@ in this project." @@ -502,11 +567,11 @@ $(inputdatasets): $(indir)/%: | $(indir) $(lockdir) else # The user has asked to update the checksum in 'INPUTS.conf'. - if [ $$sha = "--auto-replace--" ]; then + if [ $$sumin = "--auto-replace--" ]; then # Put the updated 'INPUTS.conf' in a temporary file. inputstmp=$@.inputs - awk '{if($$1 == "INPUT-$*-sha256") \ + awk '{if($$1 == "INPUT-$*-'$$suffix'") \ $$3="'$$sum'"; print}' \ $(pconfdir)/INPUTS.conf > $$inputstmp @@ -520,10 +585,10 @@ $(inputdatasets): $(indir)/%: | $(indir) $(lockdir) # Error on non-matching checksums. else echo; echo; - echo "Wrong SHA256 checksum for input file '$*':" + echo "Wrong $$verifname for input file '$*':" echo " File location: $$unchecked"; \ - echo " Expected SHA256 checksum: $$sha"; \ - echo " Calculated SHA256 checksum: $$sum"; \ + echo " Expected $$verifname: $$sumin"; \ + echo " Calculated $$verifname: $$sum"; \ echo; exit 1 fi fi |