From 623ae15c95bb8575b111709705c29b10fcf7c12b Mon Sep 17 00:00:00 2001
From: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Date: Tue, 2 Jun 2020 03:45:46 +0100
Subject: IMPORTANT: Added publication checklist, improved relevant
 infrastructure

Possible semantic conflicts (that may not show up as Git conflicts but may
cause a crash in your project after the merge):

   1) The project title (and other basic metadata) should be set in
      'reproduce/analysis/conf/metadata.conf'. Please include this file in
      your merge (if it is ignored because of '.gitattributes'!).

   2) Consider importing the changes in 'initialize.mk' and 'verify.mk' (if
      you have added all analysis Makefiles to the '.gitattributes' file,
      thus not merging any change in them with your branch). For example
      with this command:
        git diff master...maneage -- reproduce/analysis/make/initialize.mk

   3) The old 'verify-txt-no-comments-leading-space' function has been
      replaced by 'verify-txt-no-comments-no-space'. The new function will
      also remove all white-space characters between the columns (not just
      white space characters at the start of the line). Thus the resulting
      check won't involve spacing between columns.

A common set of steps is always necessary to prepare a project for
publication. Until now, we would simply look at previous submissions and
try to follow them, but that was prone to errors and could cause
confusion. The internal infrastructure also didn't have some useful
features to make good publication possible. Now that the submission of a
paper fully devoted to the founding criteria of Maneage is complete
(arXiv:2006.03018), it was time to formalize the necessary steps for easier
submission of a project using Maneage and implement some low-level features
that can make things easier.

With this commit a first draft of the publication checklist has been added
to 'README-hacking.md', it was tested in the submission of arXiv:2006.03018
and zenodo.3872248. To help guide users on implementing the good practices
for output datasets, the outputs of the default project shown in the paper
now use the new features. After reading the checklist, please inspect
these.

Some other relevant changes in this commit:

  - The publication involves a copy of the necessary software
    tarballs. Hence a new target ('dist-software') was also added to
    package all the project's software tarballs in one tarball for easy
    distribution.

  - A new 'dist-lzip' target has been defined for those who want to
    distribute an Lzip-compressed tarball.

  - The '\includetikz' LaTeX macro now has a second argument to allow
    configuring the '\includegraphics' call when the plot should not be
    built, but just imported.
---
 reproduce/analysis/make/delete-me.mk  |  98 ++++++++++++++++--------
 reproduce/analysis/make/initialize.mk | 135 ++++++++++++++++++++++++++++------
 reproduce/analysis/make/verify.mk     |  49 ++++++++----
 3 files changed, 216 insertions(+), 66 deletions(-)

(limited to 'reproduce/analysis/make')

diff --git a/reproduce/analysis/make/delete-me.mk b/reproduce/analysis/make/delete-me.mk
index fa16102..f45f9ea 100644
--- a/reproduce/analysis/make/delete-me.mk
+++ b/reproduce/analysis/make/delete-me.mk
@@ -22,18 +22,40 @@
 # Dummy dataset
 # -------------
 #
-# We will use AWK to generate a table showing X and X^2 and draw its plot.
-delete-numdir = $(texdir)/delete-me-num
-delete-num    = $(delete-numdir)/data.txt
-$(delete-numdir): | $(texdir); mkdir $@
-$(delete-num): $(pconfdir)/delete-me-num.conf | $(delete-numdir)
+# Just as a demonstration(!): we will use AWK to generate a table showing X
+# and X^2 and draw its plot.
+#
+# Note that this dataset is directly read by LaTeX to generate a plot, so
+# we need to put it in the $(tex-publish-dir) directory.
+dm-squared = $(tex-publish-dir)/squared.txt
+$(dm-squared): $(pconfdir)/delete-me-squared-num.conf | $(tex-publish-dir)
 
         # When the plotted values are re-made, it is necessary to also
-        # delete the TiKZ externalized files so the plot is also re-made.
-	rm -f $(tikzdir)/delete-me.pdf
+        # delete the TiKZ externalized files so the plot is also re-made by
+        # PGFPlots.
+	rm -f $(tikzdir)/delete-me-squared.pdf
+
+        # Write the column metadata in a temporary file name (appending
+        # '.tmp' to the actual target name). Once all steps are done, it is
+        # renamed to the final target. We do this because if there is an
+        # error in the middle, Make will not consider the job to be
+        # complete and will stop here.
+	echo "# Data for demonstration plot of default Maneage (MANaging data linEAGE)." > $@.tmp
+	echo "# It is a simple plot, showing the power of two: y=x^2! " >> $@.tmp
+	echo "# " >> $@.tmp
+	echo "# Column 1: X       [arbitrary, f32] The horizontal axis numbers." \
+	     >> $@.tmp
+	echo "# Column 2: X_POW2  [arbitrary, f32] The horizontal axis to the power of two." \
+	     >> $@.tmp
+	echo "# " >> $@.tmp
+	$(call print-copyright, $@.tmp)
 
         # Generate the table of random values.
-	awk 'BEGIN {for(i=1;i<=$(delete-me-num);i+=0.5) print i, i*i; }' > $@
+	awk 'BEGIN {for(i=1;i<=$(delete-me-squared-num);i+=0.5) \
+	              printf("%-8.1f%.2f\n", i, i*i); }' >> $@.tmp
+
+        # Write it into the final target
+	mv $@.tmp $@
 
 
 
@@ -44,14 +66,14 @@ $(delete-num): $(pconfdir)/delete-me-num.conf | $(delete-numdir)
 #
 # For an example image, we'll make a PDF copy of the WFPC II image to
 # display in the paper.
-delete-demodir = $(texdir)/delete-me-demo
-$(delete-demodir): | $(texdir); mkdir $@
-delete-pdf = $(delete-demodir)/wfpc2.pdf
-$(delete-pdf): $(delete-demodir)/%.pdf: $(indir)/%.fits | $(delete-demodir)
+dm-histdir = $(texdir)/image-histogram
+$(dm-histdir): | $(texdir); mkdir $@
+dm-img-pdf = $(dm-histdir)/wfpc2.pdf
+$(dm-img-pdf): $(dm-histdir)/%.pdf: $(indir)/%.fits | $(dm-histdir)
 
         # When the plotted values are re-made, it is necessary to also
         # delete the TiKZ externalized files so the plot is also re-made.
-	rm -f $(tikzdir)/delete-me-wfpc2.pdf
+	rm -f $(tikzdir)/delete-me-image-histogram.pdf
 
         # Convert the dataset to a PDF.
 	astconvertt --colormap=gray --fluxhigh=4 $< -h0 -o$@
@@ -63,17 +85,35 @@ $(delete-pdf): $(delete-demodir)/%.pdf: $(indir)/%.fits | $(delete-demodir)
 # Histogram of WFPC2 image
 # ------------------------
 #
-# For an example plot, we'll show the pixel value histogram also.
-delete-histogram = $(delete-demodir)/wfpc2-hist.txt
-$(delete-histogram): $(delete-demodir)/%-hist.txt: $(indir)/%.fits \
-                     | $(delete-demodir)
+# For an example plot, we'll show the pixel value histogram also. IMPORTANT
+# NOTE: because this histogram contains data that is included in a plot, we
+# should publish it, so it will go into the $(tex-publish-dir).
+dm-img-histogram = $(tex-publish-dir)/wfpc2-histogram.txt
+$(dm-img-histogram): $(tex-publish-dir)/%-histogram.txt: $(indir)/%.fits \
+                     | $(tex-publish-dir)
 
         # When the plotted values are re-made, it is necessary to also
         # delete the TiKZ externalized files so the plot is also re-made.
-	rm -f $(tikzdir)/delete-me-wfpc2.pdf
+	rm -f $(tikzdir)/delete-me-image-histogram.pdf
+
+        # Generate the pixel value histogram.
+	aststatistics --lessthan=5 $< -h0 --histogram -o$@.data
+
+        # Put a two-line description of the dataset, copy the column
+        # metadata from '$@.data', and add copyright.
+	echo "# Histogram of example image to demonstrate Maneage (MANaging data linEAGE)." \
+	     > $@.tmp
+	echo "# Example image URL: $(WFPC2URL)/$(WFPC2IMAGE)" >> $@.tmp
+	echo "# " >> $@.tmp
+	awk '/^# Column .:/' $@.data >> $@.tmp
+	echo "# " >> $@.tmp
+	$(call print-copyright, $@.tmp)
 
-        # Generate the pixel value distribution
-	aststatistics --lessthan=5 $< -h0 --histogram -o$@
+        # Add the column numbers in a formatted manner, rename it to the
+        # output and clean up.
+	awk '!/^#/{printf("%-15.4f%d\n", $$1, $$2)}' $@.data >> $@.tmp
+	mv $@.tmp $@
+	rm $@.data
 
 
 
@@ -84,9 +124,9 @@ $(delete-histogram): $(delete-demodir)/%-hist.txt: $(indir)/%.fits \
 #
 # This is just as a demonstration on how to get analysic configuration
 # parameters from variables defined in `reproduce/analysis/config/'.
-delete-stats = $(delete-demodir)/wfpc2-stats.txt
-$(delete-stats): $(delete-demodir)/%-stats.txt: $(indir)/%.fits \
-                 | $(delete-demodir)
+dm-img-stats = $(dm-histdir)/wfpc2-stats.txt
+$(dm-img-stats): $(dm-histdir)/%-stats.txt: $(indir)/%.fits \
+                 | $(dm-histdir)
 	aststatistics $< -h0 --mean --median > $@
 
 
@@ -100,11 +140,11 @@ $(delete-stats): $(delete-demodir)/%-stats.txt: $(indir)/%.fits \
 #
 # NOTE: In LaTeX you cannot use any non-alphabetic character in a variable
 # name.
-$(mtexdir)/delete-me.tex: $(delete-num) $(delete-pdf) $(delete-histogram) \
-                          $(delete-stats)
+$(mtexdir)/delete-me.tex: $(dm-squared) $(dm-img-pdf) $(dm-img-histogram) \
+                          $(dm-img-stats)
 
         # Write the number of random values used.
-	echo "\newcommand{\deletemenum}{$(delete-me-num)}" > $@
+	echo "\newcommand{\deletemenum}{$(delete-me-squared-num)}" > $@
 
         # Note that since Make variables start with a `$(', if you want to
         # use `$' within the shell (not Make), you have to quote any
@@ -116,14 +156,14 @@ $(mtexdir)/delete-me.tex: $(delete-num) $(delete-pdf) $(delete-histogram) \
         # macro definition.
 	mm=$$(awk 'BEGIN{min=99999; max=-min}
 	           !/^#/{if($$2>max) max=$$2; if($$2<min) min=$$2;}
-	           END{print min, max}' $(delete-num));
+	           END{print min, max}' $(dm-squared));
 	v=$$(echo "$$mm" | awk '{printf "%.3f", $$1}');
 	echo "\newcommand{\deletememin}{$$v}"             >> $@
 	v=$$(echo "$$mm" | awk '{printf "%.3f", $$2}');
 	echo "\newcommand{\deletememax}{$$v}"             >> $@
 
         # Write the statistics of the WFPC2 image as a macro.
-	mean=$$(awk     '{printf("%.2f", $$1)}' $(delete-stats))
+	mean=$$(awk     '{printf("%.2f", $$1)}' $(dm-img-stats))
 	echo "\newcommand{\deletemewfpctwomean}{$$mean}"          >> $@
-	median=$$(awk   '{printf("%.2f", $$2)}' $(delete-stats))
+	median=$$(awk   '{printf("%.2f", $$2)}' $(dm-img-stats))
 	echo "\newcommand{\deletemewfpctwomedian}{$$median}"      >> $@
diff --git a/reproduce/analysis/make/initialize.mk b/reproduce/analysis/make/initialize.mk
index 4e317bb..19447a6 100644
--- a/reproduce/analysis/make/initialize.mk
+++ b/reproduce/analysis/make/initialize.mk
@@ -202,6 +202,16 @@ $(lockdir): | $(BDIR); mkdir $@
 
 
 
+# Version and distribution tarball definitions
+project-commit-hash := $(shell if [ -d .git ]; then \
+    echo $$(git describe --dirty --always --long); else echo NOGIT; fi)
+project-package-name := maneaged-$(project-commit-hash)
+project-package-contents = $(texdir)/$(project-package-name)
+
+
+
+
+
 # High-level Makefile management
 # ------------------------------
 #
@@ -212,11 +222,8 @@ $(lockdir): | $(BDIR); mkdir $@
 # we want to ensure that the file is always built in every run: it contains
 # the project version which may change between two separate runs, even when
 # no file actually differs.
-packagebasename := $(shell if [ -d .git ]; then \
-    echo paper-$$(git describe --dirty --always --long); else echo NOGIT; fi)
-packagecontents = $(texdir)/$(packagebasename)
-.PHONY: all clean dist dist-zip distclean clean-mmap $(packagecontents) \
-        $(mtexdir)/initialize.tex
+.PHONY: all clean dist dist-zip dist-lzip dist-software distclean \
+        clean-mmap $(project-package-contents) $(mtexdir)/initialize.tex
 
 # --------- Delete for no Gnuastro ---------
 clean-mmap:; rm -f reproduce/config/gnuastro/mmap*
@@ -260,11 +267,11 @@ distclean: clean
 # that is ready for building the final PDF with LaTeX. This is useful for
 # collaborators who only want to contribute to the text of your project,
 # without having to worry about the technicalities of the analysis.
-$(packagecontents): paper.pdf | $(texdir)
+$(project-package-contents): paper.pdf | $(texdir)
 
         # Set up the output directory, delete it if it exists and remake it
         # to fill with new contents.
-	dir=$(texdir)/$(packagebasename)
+	dir=$@
 	rm -rf $$dir
 	mkdir $$dir
 
@@ -298,7 +305,7 @@ $(packagecontents): paper.pdf | $(texdir)
 	cp -r tex/src                            $$dir/tex/src
 	cp tex/tikz/*.pdf                        $$dir/tex/tikz
 	cp -r reproduce/*                        $$dir/reproduce
-	cp -r tex/build/!(paper-v*)              $$dir/tex/build
+	cp -r tex/build/!($(project-package-name)) $$dir/tex/build
 
         # Clean up un-necessary/local files: 1) the $(texdir)/build*
         # directories (when building in a group structure, there will be
@@ -337,32 +344,113 @@ $(packagecontents): paper.pdf | $(texdir)
 
         # Clean temporary (currently those ending in `~') files.
 	cd $(texdir)
-	find $(packagebasename) -name \*~ -delete
-	find $(packagebasename) -name \*.swp -delete
+	find $(project-package-name) -name \*~ -delete
+	find $(project-package-name) -name \*.swp -delete
 
         # PROJECT SPECIFIC
         # ----------------
         # Put any project specific distribution steps here.
         # ----------------
 
-# Package into `.tar.gz'.
-dist: $(packagecontents)
+# Package into `.tar.gz' or '.tar.lz'.
+dist dist-lzip: $(project-package-contents)
 	curdir=$$(pwd)
 	cd $(texdir)
-	tar -cf $(packagebasename).tar $(packagebasename)
-	gzip -f --best $(packagebasename).tar
-	rm -rf $(packagebasename)
+	tar -cf $(project-package-name).tar $(project-package-name)
+	if [ $@ = dist ]; then
+	  suffix=gz
+	  gzip -f --best $(project-package-name).tar
+	elif [ $@ = dist-lzip ]; then
+	  suffix=lz
+	  lzip -f --best $(project-package-name).tar
+	fi
+	rm -rf $(project-package-name)
 	cd $$curdir
-	mv $(texdir)/$(packagebasename).tar.gz ./
+	mv $(texdir)/$(project-package-name).tar.$$suffix ./
 
 # Package into `.zip'.
-dist-zip: $(packagecontents)
+dist-zip: $(project-package-contents)
 	curdir=$$(pwd)
 	cd $(texdir)
-	zip -q -r $(packagebasename).zip $(packagebasename)
-	rm -rf $(packagebasename)
+	zip -q -r $(project-package-name).zip $(project-package-name)
+	rm -rf $(project-package-name)
+	cd $$curdir
+	mv $(texdir)/$(project-package-name).zip ./
+
+# Package the software tarballs.
+dist-software:
+
+        # Name the package from 'project-commit-hash' (set while parsing,
+        # in Make's starting directory): a '.git' test after 'cd $(BDIR)'
+        # would look in the build directory. Stale output is removed first.
+	curdir=$$(pwd)
+	dirname=software-$(project-commit-hash)
+	cd $(BDIR)
+	rm -rf $$dirname
+	mkdir $$dirname
+	cp -L software/tarballs/* $$dirname/
+	tar -cf $$dirname.tar $$dirname
+	gzip -f --best $$dirname.tar
+	rm -rf $$dirname
+	cd $$curdir
+	mv $(BDIR)/$$dirname.tar.gz ./
+
+
+
+
+
+# Directory containing to-be-published datasets
+# ---------------------------------------------
+#
+# It's good practice (so you don't forget in the last moment!) to have all
+# the plot/figure/table data that you ultimately want to publish in a
+# single directory.
+#
+# There are two types of to-publish data in the project.
+#
+#  1. Those data that also go into LaTeX (for example to give to LaTeX's
+#     PGFPlots package to create the plot internally) should be under the
+#     '$(BDIR)/tex' directory (because other LaTeX producers may also need
+#     it for example when using './project make dist'). The contents of
+#     this directory are directly taken into the tarball.
+#
+#  2. The data that aren't included directly in the LaTeX run of the paper,
+#     can be seen as supplements. A good place to keep them is under your
+#     build-directory.
+#
+# RECOMMENDATION: don't put the figure/plot/table number in the names of
+# your to-be-published datasets! Give them a descriptive/short name that
+# would be clear to anyone who has read the paper. Later, in the caption
+# (or paper's tex/appendix), you will put links to the dataset on servers
+# like Zenodo (see the "Publication checklist" in 'README-hacking.md').
+tex-publish-dir = $(texdir)/to-publish
+data-publish-dir = $(BDIR)/data-to-publish
+$(tex-publish-dir): | $(texdir); mkdir $@
+$(data-publish-dir): | $(BDIR); mkdir $@
+
+
+
+
+
+# Print Copyright statement
+# -------------------------
+#
+# Appends project metadata, copyright and license lines (all starting with
+# '#') to the plain-text file named in the first argument, which should
+# already hold the data-specific comments. Empty optional IDs are skipped.
+print-copyright = \
+	echo "\# Project title: $(metadata-title)" >> $(1); \
+	echo "\# Git commit (that produced this dataset): $(project-commit-hash)" >> $(1); \
+	echo "\# Project's Git repository: $(metadata-git-repository)" >> $(1); \
+	if [ x"$(metadata-arxiv)" != x ]; then \
+	  echo "\# Pre-print server: arXiv:$(metadata-arxiv)" >> $(1); fi; \
+	if [ x"$(metadata-doi-journal)" != x ]; then \
+	  echo "\# DOI (Journal): $(metadata-doi-journal)" >> $(1); fi; \
+	if [ x"$(metadata-doi-zenodo)" != x ]; then \
+	  echo "\# DOI (Zenodo): $(metadata-doi-zenodo)" >> $(1); fi; \
+	echo "\#" >> $(1); \
+	echo "\# Copyright (C) $$(date +%Y) $(metadata-copyright-owner)" >> $(1); \
+	echo "\# Dataset is available under $(metadata-copyright)." >> $(1); \
+	echo "\# License URL: $(metadata-copyright-url)" >> $(1);
 
 
 
@@ -377,7 +465,6 @@ dist-zip: $(packagecontents)
 # actually exists, it is also aded as a `.PHONY' target above.
 $(mtexdir)/initialize.tex: | $(mtexdir)
 
-        # Version of the project.
-	@if [ -d .git ]; then v=$$(git describe --dirty --always --long);
-	else                  v=NO-GIT; fi
-	echo "\newcommand{\projectversion}{$$v}" > $@
+        # Version and title of project.
+	echo "\newcommand{\projecttitle}{$(metadata-title)}" > $@
+	echo "\newcommand{\projectversion}{$(project-commit-hash)}" >> $@
diff --git a/reproduce/analysis/make/verify.mk b/reproduce/analysis/make/verify.mk
index 43d1472..67b3fea 100644
--- a/reproduce/analysis/make/verify.mk
+++ b/reproduce/analysis/make/verify.mk
@@ -40,22 +40,34 @@ verify-print-tips = \
   echo "the following project source file:"; \
   echo "    reproduce/analysis/make/verify.mk"
 
-verify-txt-no-comments-leading-space = \
+# Removes following components of a plain-text file, calculates checksum
+# and compares with given checksum:
+#   - All commented lines (starting with '#') are removed.
+#   - All empty lines are removed.
+#   - All space-characters in remaining lines are removed (so the width of
+#     the printed columns won't invalidate the verification).
+#
+# It takes three arguments:
+#   - First argument: Full address of file to check.
+#   - Second argument: Expected checksum of the file to check.
+#   - Third argument: Name of file to append the result to.
+verify-txt-no-comments-no-space = \
   infile=$(strip $(1)); \
   inchecksum=$(strip $(2)); \
+  innobdir=$$(echo $$infile | sed -e's|$(BDIR)/||g'); \
   if ! [ -f "$$infile" ]; then \
     $(call verify-print-error-start); \
     echo "The following file (that should be verified) doesn't exist:"; \
     echo "    $$infile"; \
     echo; exit 1; \
   fi; \
-  checksum=$$(sed -e 's/^[[:space:]]*//g' \
+  checksum=$$(sed -e 's/[[:space:]][[:space:]]*//g' \
                   -e 's/\#.*$$//' \
                   -e '/^$$/d' $$infile \
-	          | md5sum \
-	          | awk '{print $$1}'); \
+                  | md5sum \
+                  | awk '{print $$1}'); \
   if [ x"$$inchecksum" = x"$$checksum" ]; then \
-    echo "Verified: $$infile"; \
+    echo "%% (VERIFIED) $$checksum $$innobdir" >> $(3); \
   else \
     $(call verify-print-error-start); \
     $(call verify-print-tips); \
@@ -105,11 +117,20 @@ $(mtexdir)/verify.tex: $(foreach s, $(verify-dep), $(mtexdir)/$(s).tex)
         # Make sure that verification is actually requested.
 	if [ x"$(verify-outputs)" = xyes ]; then
 
+          # Make sure the temporary output doesn't exist (because we want
+          # to append to it). We are making a temporary output target so if
+          # there is a crash in the middle, Make will not continue. If we
+          # write in the final target progressively, the file will exist,
+          # and its date will be more recent than all prerequisites, so
+          # next time the project is run, Make will continue and ignore the
+          # rest of the checks.
+	  rm -f $@.tmp
+
           # Verify the figure datasets.
-	  $(call verify-txt-no-comments-leading-space, \
-	         $(delete-num), ad345e873e6af577f0e4e7c8942cdf08)
-	  $(call verify-txt-no-comments-leading-space, \
-	         $(delete-histogram), 12a81c4c8c5f552e5ed5686453587fe8)
+	  $(call verify-txt-no-comments-no-space, \
+	         $(dm-squared), 6b6d3b0f9c351de53606507b59bca5d1, $@.tmp)
+	  $(call verify-txt-no-comments-no-space, \
+	         $(dm-img-histogram), b1f9c413f915a1ad96078fee8767b16c, $@.tmp)
 
           # Verify TeX macros (the values that go into the PDF text).
 	  for m in $(verify-check); do
@@ -118,9 +139,11 @@ $(mtexdir)/verify.tex: $(foreach s, $(verify-dep), $(mtexdir)/$(s).tex)
 	    elif [ $$m == delete-me ]; then s=711e2f7fa1f16ecbeeb3df6bcb4ec705
 	    else echo; echo "'$$m' not recognized."; exit 1
 	    fi
-	    $(call verify-txt-no-comments-leading-space, $$file, $$s)
+	    $(call verify-txt-no-comments-no-space, $$file, $$s, $@.tmp)
 	  done
-	fi
 
-        # Make an empty final target.
-	touch $@
+          # Move temporary file to final target.
+	  mv $@.tmp $@
+	else
+	  echo "% Verification was DISABLED!" > $@
+	fi
-- 
cgit v1.2.1