Diffstat (limited to 'reproduce/analysis')
-rw-r--r--   reproduce/analysis/config/INPUTS.conf | 109
-rw-r--r--   reproduce/analysis/make/download.mk   |  64
2 files changed, 99 insertions, 74 deletions
diff --git a/reproduce/analysis/config/INPUTS.conf b/reproduce/analysis/config/INPUTS.conf
index 936f5f9..5a58758 100644
--- a/reproduce/analysis/config/INPUTS.conf
+++ b/reproduce/analysis/config/INPUTS.conf
@@ -1,40 +1,68 @@
-# Input files necessary for this project, the variables defined in this
-# file are primarily used in 'reproduce/analysis/make/download.mk'. See
-# there for precise usage of the variables. But comments are also provided
-# here.
-#
-# Necessary variables for each input dataset are listed below. Its good
-# that all the variables of each file have the same base-name (in the
-# example below 'DEMO') with descriptive suffixes, also put a short comment
-# above each group of variables for each dataset, shortly explaining what
-# it is.
-#
-# 1) Local file name ('DEMO-DATA' below): this is the name of the dataset
-#    on the local system (in 'INDIR', given at configuration time). It is
-#    recommended that it be the same name as the online version of the
-#    file like the case here (note how this variable is used in 'DEMO-URL'
-#    for the dataset's full URL). However, this is not always possible, so
-#    the local and server filenames may be different. Ultimately, the file
-#    name is irrelevant, we check the integrity with the checksum.
-#
-# 2) The MD5 checksum of the file ('DEMO-MD5' below): this is very
-#    important for an automatic verification of the file. You can
-#    calculate it by running 'md5sum' on your desired file. You can also
-#    use any other checksum tool that you prefer, just be sure to correct
-#    the respective command in 'reproduce/analysis/make/download.mk'.
-#
-# 3) The human-readable size of the file ('DEMO-SIZE' below): this is an
-#    optional variable, mainly to help a reader of your project get a
-#    sense of the volume they need to download if they don't already have
-#    the dataset. So it is highly recommended to add it (future readers of
-#    your project's source will appreciate it!). You can get it from the
-#    output of 'ls -lh' command on the file. Optionally you can use it in
-#    messages during the configuration phase (when Maneage asks for the
-#    input data directory), along with other info about the file(s).
-#
-# 4) The full dataset URL ('DEMO-URL' below): this is the full URL
-#    (including the file-name) that can be used to download the dataset
-#    when necessary. Also, see the description above on local filename.
+# This project's input file information (metadata).
+#
+# For each input (external) data file that is used within the project,
+# three variables are suggested here (two of them are mandatory). These
+# variables will be used by 'reproduce/analysis/make/download.mk' to import
+# the dataset into the project (within the build directory):
+#
+# - If the file already exists locally in '$(INDIR)' (the optional input
+#   directory that may have been specified at configuration time with
+#   '--input-dir'), a symbolic link will be added in '$(indir)' (in the
+#   build directory). A symbolic link is used to avoid extra storage when
+#   files are large.
+#
+# - If the file doesn't exist in '$(INDIR)', or no input directory was
+#   specified at configuration time, then the file is downloaded from a
+#   specific URL.
+#
+# In both cases, before placing the file (or its link) in the build
+# directory, 'reproduce/analysis/make/download.mk' will check the SHA256
+# checksum of the dataset and if it differs from the pre-defined value (set
+# for that file, here), it will abort (since this is not the intended
+# dataset).
+#
+# Therefore, the two variables specifying the URL and SHA256 checksum of
+# the file are MANDATORY. The third variable (INPUT-%-size), showing the
+# human-readable size of the file (from 'ls -lh'), is optional (but
+# recommended: it gives future scientists a feeling for the volume of
+# data they need to download, which becomes important when the
+# size/number of files is large).
+#
+# The naming convention is critical for the input files to be properly
+# imported into the project. In the patterns below, the '%' is the full
+# file name (including its suffix): for example, in the demo input of this
+# file in the 'maneage' branch, we have 'INPUT-wfpc2.fits-sha256':
+# therefore, the input file (within the project's '$(indir)') is called
+# 'wfpc2.fits'. This allows you to simply set '$(indir)/wfpc2.fits' as the
+# pre-requisite of any recipe that needs the input file: you will rarely
+# (if at all!) need to use these variables directly.
+#
+# INPUT-%-sha256: The SHA256 checksum of the file. You can generate the
+#                 SHA256 checksum of a file with the 'sha256sum FILENAME'
+#                 command (where 'FILENAME' is the name of your
+#                 file). This is very important for an automatic
+#                 verification of the file: that it hasn't changed
+#                 between different runs of the project (locally or in
+#                 the URL). SHA256 (of the SHA-2 family) is more
+#                 robust than older algorithms like MD5.
+#
+# INPUT-%-url:    The URL to download the file if it is not available
+#                 locally. It can happen that during the first phases of
+#                 your project the data aren't yet public. In this case, you
+#                 can set a phony URL as a clear place-holder, for example:
+#                 'https://this.file/is/not/yet/public'.
+#
+# INPUT-%-size:   The human-readable size of the file (output of 'ls
+#                 -lh'). This is not used by default but can help other
+#                 scientists who would like to run your project get a
+#                 good feeling of the network and storage capacity that
+#                 is necessary to start the project.
+#
+# The input dataset's name (that goes into the '%') can be different from
+# the URL's file name (last component of the URL, after the last '/'). Just
+# note that it is assumed that the local copy (outside of your project) is
+# also called '%' (if your local copy and the online file have different
+# names, be sure to set '%' to the name of the local copy).
 #
 # Copyright (C) 2018-2022 Mohammad Akhlaghi <mohammad@akhlaghi.org>
 #
@@ -48,7 +76,6 @@


 # Demo dataset used in the histogram plot (remove when customizing).
-DEMO-DATA = WFPC2ASSNu5780205bx.fits
-DEMO-MD5  = a4791e42cd1045892f9c41f11b50bad8
-DEMO-SIZE = 62K
-DEMO-URL  = https://fits.gsfc.nasa.gov/samples/$(DEMO-DATA)
+INPUT-wfpc2.fits-size   = 62K
+INPUT-wfpc2.fits-url    = https://fits.gsfc.nasa.gov/samples/WFPC2ASSNu5780205bx.fits
+INPUT-wfpc2.fits-sha256 = 9851bc2bf9a42008ea606ec532d04900b60865daaff2f233e5c8565dac56ad5f
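As an illustration of the convention described in the comments above, a second (purely hypothetical) input would be declared with the same three-variable pattern; the file name, URL, size and checksum below are placeholders and are not part of this commit:

    # Hypothetical extra input (illustration only): the checksum would come
    # from 'sha256sum catalog.fits' and the size from 'ls -lh catalog.fits'.
    INPUT-catalog.fits-size   = 1.2M
    INPUT-catalog.fits-url    = https://example.org/data/catalog.fits
    INPUT-catalog.fits-sha256 = 0000000000000000000000000000000000000000000000000000000000000000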
diff --git a/reproduce/analysis/make/download.mk b/reproduce/analysis/make/download.mk
index e652c17..6e67962 100644
--- a/reproduce/analysis/make/download.mk
+++ b/reproduce/analysis/make/download.mk
@@ -27,22 +27,20 @@
 # Download input data
 # --------------------
 #
-# The input dataset properties are defined in
-# '$(pconfdir)/INPUTS.conf'. For this template we only have one dataset to
-# enable easy processing, so all the extra checks in this rule may seem
-# redundant.
+# 'reproduce/analysis/config/INPUTS.conf' contains the input dataset
+# properties. In most cases, you will not need to edit this rule (or
+# file!). Simply follow the instructions of 'INPUTS.conf' and set the
+# variable names according to the described standards.
 #
-# In a real project, you will need more than one dataset. In that case,
-# just add them to the target list and add an 'elif' statement to define it
-# in the recipe.
-#
-# Files in a server usually have very long names, which are mainly designed
-# for helping in data-base management and being generic. Since Make uses
-# file names to identify which rule to execute, and the scope of this
-# research project is much less than the generic survey/dataset, it is
-# easier to have a simple/short name for the input dataset and work with
-# that. In the first condition of the recipe below, we connect the short
-# name with the raw database name of the dataset.
+# TECHNICAL NOTE on the '$(foreach i, ...)' loop of 'inputdatasets': we are
+# using several (relatively complex!) features particular to Make: in GNU
+# Make, '.VARIABLES' "... expands to a list of the names of all global
+# variables defined so far" (from the "Other Special Variables" section of
+# the GNU Make manual). Assuming that the pattern 'INPUT-%-sha256' is only
+# used for input files, we find all the variables that contain an input
+# file name (the '%' is the file name). Finally, using the
+# pattern-substitution function ('patsubst'), we remove the fixed strings
+# at the start and end of the variable name.
 #
 # Download lock file: Most systems have a single connection to the
 # internet, therefore downloading is inherently done in series. As a
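The '.VARIABLES' filtering that the TECHNICAL NOTE above describes can be tried in isolation with a throw-away Makefile like the sketch below (the variable values are dummies, and the recipe line must start with a tab in a real Makefile):

    # Stand-alone illustration, not project code.
    INPUT-a.fits-sha256 = dummy1
    INPUT-b.txt-sha256  = dummy2

    # 'filter' keeps only the variable NAMES matching the pattern;
    # 'patsubst' then strips the fixed prefix/suffix, leaving 'a.fits b.txt'.
    names = $(patsubst INPUT-%-sha256,%, \
              $(filter INPUT-%-sha256,$(.VARIABLES)))

    all:
            @echo $(names)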
@@ -53,16 +51,16 @@
 # progress at every moment.
 $(indir):; mkdir $@
 downloadwrapper = $(bashdir)/download-multi-try
-inputdatasets = $(foreach i, wfpc2, $(indir)/$(i).fits)
-$(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
+inputdatasets = $(foreach i, \
+                  $(patsubst INPUT-%-sha256,%, \
+                  $(filter INPUT-%-sha256,$(.VARIABLES))), \
+                  $(indir)/$(i))
+$(inputdatasets): $(indir)/%: | $(indir) $(lockdir)

-# Set the necessary parameters for this input file.
-	if [ $* = wfpc2 ]; then
-	  localname=$(DEMO-DATA); url=$(DEMO-URL); mdf=$(DEMO-MD5);
-	else
-	  echo; echo; echo "Not recognized input dataset: '$*.fits'."
-	  echo; echo; exit 1
-	fi
+# Set the necessary parameters for this input file as shell variables
+# (to help in readability).
+	url=$(INPUT-$*-url)
+	sha=$(INPUT-$*-sha256)

 # Download (or make the link to) the input dataset. If the file
 # exists in 'INDIR', it may be a symbolic link to some other place in
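To make the computed variable names in the hunk above concrete: for the demo dataset the target is '$(indir)/wfpc2.fits', so the pattern stem '$*' is 'wfpc2.fits' and the two shell variables expand (values taken from INPUTS.conf above) roughly as:

    #   url=$(INPUT-$*-url)     ->  url=https://fits.gsfc.nasa.gov/samples/WFPC2ASSNu5780205bx.fits
    #   sha=$(INPUT-$*-sha256)  ->  sha=9851bc2bf9a42008ea606ec532d04900b60865daaff2f233e5c8565dac56ad5f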
@@ -72,25 +70,25 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
 # GNU Coreutils). If its not a link, the 'readlink' part has no
 # effect.
 	unchecked=$@.unchecked
-	if [ -f $(INDIR)/$$localname ]; then
-	  ln -fs $$(readlink -f $(INDIR)/$$localname) $$unchecked
+	if [ -f $(INDIR)/$* ]; then
+	  ln -fs $$(readlink -f $(INDIR)/$*) $$unchecked
 	else
 	  touch $(lockdir)/download
 	  $(downloadwrapper) "wget --no-use-server-timestamps -O" \
 	                     $(lockdir)/download $$url $$unchecked
 	fi

-# Check the md5 sum to see if this is the proper dataset.
-	sum=$$(md5sum $$unchecked | awk '{print $$1}')
-	if [ $$sum = $$mdf ]; then
+# Check the checksum to see if this is the proper dataset.
+	sum=$$(sha256sum $$unchecked | awk '{print $$1}')
+	if [ $$sum = $$sha ]; then
 	  mv $$unchecked $@
 	  echo "Integrity confirmed, using $@ in this project."
 	else
 	  echo; echo;
-	  echo "Wrong MD5 checksum for input file '$$localname':"
+	  echo "Wrong SHA256 checksum for input file '$*':"
 	  echo "  File location: $$unchecked"; \
-	  echo "  Expected MD5 checksum:      $$mdf"; \
-	  echo "  Calculated MD5 checksum:    $$sum"; \
+	  echo "  Expected SHA256 checksum:   $$sha"; \
+	  echo "  Calculated SHA256 checksum: $$sum"; \
 	  echo; exit 1
 	fi

@@ -104,4 +102,4 @@ $(inputdatasets): $(indir)/%.fits: | $(indir) $(lockdir)
 # It is very important to mention the address where the data were
 # downloaded in the final report.
 $(mtexdir)/download.tex: $(pconfdir)/INPUTS.conf | $(mtexdir)
-	echo "\\newcommand{\\wfpctwourl}{$(DEMO-URL)}" > $@
+	echo "\\newcommand{\\wfpctwourl}{$(INPUT-wfpc2.fits-url)}" > $@
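Downstream rules only need to list the imported file as a prerequisite, as the INPUTS.conf comments note. A minimal hypothetical sketch (the directory variable and target name are made up for illustration; the recipe line is tab-indented in a real Makefile):

    # Any recipe needing the input just depends on '$(indir)/wfpc2.fits';
    # the pattern rule above links/downloads and checksum-verifies it.
    $(a1dir)/wfpc2-checksum.txt: $(indir)/wfpc2.fits | $(a1dir)
            sha256sum $< > $@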