From e623102768c426e86b0ed73904168006dfea2af9 Mon Sep 17 00:00:00 2001 From: Mohammad Akhlaghi Date: Sun, 25 Nov 2018 15:22:48 +0000 Subject: Pipeline now downloads and uses an input dataset In most analysis situations (except for simulations), an input dataset is necessary, but that part of the pipeline was just left out and a general `SURVEY' variable was set and never used. So with this commit, we actually use a sample FITS file from the FITS standard webpage, show it (as well as its histogram) and do some basic calculations on it. This preparation of the input datasets is done in a generic way to enable easy addition of more datasets if necessary. --- reproduce/src/make/delete-me.mk | 71 +++++++++++++++++++++++++++++++++++--- reproduce/src/make/dependencies.mk | 7 +++- reproduce/src/make/download.mk | 57 +++++++++++++++++++++++------- reproduce/src/make/initialize.mk | 9 +++++ 4 files changed, 126 insertions(+), 18 deletions(-) (limited to 'reproduce/src') diff --git a/reproduce/src/make/delete-me.mk b/reproduce/src/make/delete-me.mk index 67f0440..9227fde 100644 --- a/reproduce/src/make/delete-me.mk +++ b/reproduce/src/make/delete-me.mk @@ -25,8 +25,7 @@ # Dummy dataset # ------------- # -# We will use AWK's random number generator to generate a random dataset to -# be imported by PGFPlots for a plot in the paper. +# We will use AWK to generate a table showing X and X^2 and draw its plot. dmdir = $(texdir)/delete-me dm = $(dmdir)/data.txt $(dmdir): | $(texdir); mkdir $@ @@ -43,6 +42,60 @@ $(dm): $(pconfdir)/delete-me-num.mk | $(dmdir) +# WFPC2 image PDF +# ----------------- +# +# For an example image, we'll make a PDF copy of the WFPC II image to +# display in the paper. +wfpc2dir = $(texdir)/delete-me-wfpc2 +$(wfpc2dir): | $(texdir); mkdir $@ +wfpc2 = $(wfpc2dir)/wfpc2.pdf +$(wfpc2): $(indir)/$(WFPC2IMAGE) | $(wfpc2dir) + + # When the plotted values are re-made, it is necessary to also + # delete the TiKZ externalized files so the plot is also re-made. 
+ rm -f $(tikzdir)/delete-me-wfpc2.pdf + + # Convert the dataset to a PDF. + astconvertt --fluxhigh=4 $< -h0 -o$@ + + + + + +# Histogram of WFPC2 image +# ------------------------ +# +# For an example plot, we'll show the pixel value histogram also. +wfpc2hist = $(wfpc2dir)/wfpc2-hist.txt +$(wfpc2hist): $(indir)/$(WFPC2IMAGE) | $(wfpc2dir) + + # When the plotted values are re-made, it is necessary to also + # delete the TiKZ externalized files so the plot is also re-made. + rm -f $(tikzdir)/delete-me-wfpc2.pdf + + # Generate the pixel value distribution + aststatistics --lessthan=5 $< -h0 --histogram -o$@ + + + + + +# Basic statistics +# ---------------- +# +# This is just as a demonstration on how to get analysis configuration +# parameters from variables defined in `reproduce/config/pipeline'. +wfpc2stats = $(wfpc2dir)/wfpc2-stats.txt +$(wfpc2stats): $(indir)/$(WFPC2IMAGE) $(pconfdir)/delete-me-wfpc2-quant.mk \ + | $(wfpc2dir) + aststatistics $< -h0 --mean --median \ + --quantile=$(delete-me-wfpc2-quantile) > $@ + + + + + # TeX macros # ---------- # @@ -50,7 +103,7 @@ $(dm): $(pconfdir)/delete-me-num.mk | $(dmdir) # # NOTE: In LaTeX you cannot use any non-alphabetic character in a variable # name. -$(mtexdir)/delete-me.tex: $(dm) +$(mtexdir)/delete-me.tex: $(dm) $(wfpc2) $(wfpc2hist) $(wfpc2stats) # Write the number of random values used. echo "\newcommand{\deletemenum}{$(delete-me-num)}" > $@ @@ -67,6 +120,16 @@ $(mtexdir)/delete-me.tex: $(dm) {if($$2>max) max=$$2; if($$2> $@; + echo "\newcommand{\deletememin}{$$v}" >> $@ v=$$(echo "$$mm" | awk '{printf "%.3f", $$2}'); echo "\newcommand{\deletememax}{$$v}" >> $@ + + # Write the statistics of the WFPC2 image as a macro. 
+ q=$(delete-me-wfpc2-quantile) + echo "\newcommand{\deletemewfpcquantile}{$$q}" >> $@ + mean=$$(awk '{printf("%.2f", $$1)}' $(wfpc2stats)) + echo "\newcommand{\deletemewfpctwomean}{$$mean}" >> $@ + median=$$(awk '{printf("%.2f", $$2)}' $(wfpc2stats)) + echo "\newcommand{\deletemewfpctwomedian}{$$median}" >> $@ + quantile=$$(awk '{printf("%.2f", $$3)}' $(wfpc2stats)) + echo "\newcommand{\deletemewfpctwoquantile}{$$quantile}" >> $@ diff --git a/reproduce/src/make/dependencies.mk b/reproduce/src/make/dependencies.mk index 8ed359b..a784883 100644 --- a/reproduce/src/make/dependencies.mk +++ b/reproduce/src/make/dependencies.mk @@ -43,7 +43,7 @@ ildir = $(BDIR)/dependencies/installed/lib ilidir = $(BDIR)/dependencies/installed/lib/built # Define the top-level programs to build (installed in `.local/bin'). -top-level-programs = gawk gs grep sed git astnoisechisel texlive-ready +top-level-programs = gawk gs grep sed git flock astnoisechisel texlive-ready all: $(foreach p, $(top-level-programs), $(ibdir)/$(p)) # Other basic environment settings: We are only including the host @@ -75,6 +75,7 @@ LD_LIBRARY_PATH := $(ildir) tarballs = $(foreach t, cfitsio-$(cfitsio-version).tar.gz \ cmake-$(cmake-version).tar.gz \ curl-$(curl-version).tar.gz \ + flock-$(flock-version).tar.xz \ gawk-$(gawk-version).tar.lz \ ghostscript-$(ghostscript-version).tar.gz \ git-$(git-version).tar.xz \ @@ -111,6 +112,7 @@ $(tarballs): $(tdir)/%: w=https://heasarc.gsfc.nasa.gov/FTP/software/fitsio/c/cfitsio$$v.tar.gz elif [ $$n = cmake ]; then w=https://cmake.org/files/v3.12 elif [ $$n = curl ]; then w=https://curl.haxx.se/download + elif [ $$n = flock ]; then w=https://github.com/discoteq/flock/releases/download/v$(flock-version) elif [ $$n = gawk ]; then w=http://ftp.gnu.org/gnu/gawk elif [ $$n = ghostscript ]; then w=https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs926 elif [ $$n = git ]; then w=https://mirrors.edge.kernel.org/pub/software/scm/git @@ -244,6 +246,9 @@ 
$(ibdir)/libtool: $(tdir)/libtool-$(libtool-version).tar.xz $(ibdir)/gs: $(tdir)/ghostscript-$(ghostscript-version).tar.gz $(call gbuild, $<, ghostscript-$(ghostscript-version)) +$(ibdir)/flock: $(tdir)/flock-$(flock-version).tar.xz + $(call gbuild, $<, flock-$(flock-version), static) + $(ibdir)/git: $(tdir)/git-$(git-version).tar.xz \ $(ilidir)/zlib $(call gbuild, $<, git-$(git-version), static) diff --git a/reproduce/src/make/download.mk b/reproduce/src/make/download.mk index 9617a45..180d2cf 100644 --- a/reproduce/src/make/download.mk +++ b/reproduce/src/make/download.mk @@ -25,20 +25,51 @@ -# Download SURVEY data +# Download input data # -------------------- # -# Data from a survey (for example an imaging survey) usually have a special -# file-name format which should be set here in the `foreach' loop. Note -# that the `foreach' function needs the backslash (`\') at the end of the -# line when it is broken into multiple lines. -all-survey = $(foreach f, $(filters-survey), \ - $(SURVEY)/a-special-format-$(f).fits \ - $(SURVEY)/a-possibly-additional-$(f)-format.fits ) -$(SURVEY):; mkdir $@ -$(all-survey): $(SURVEY)/%: | $(SURVEY) $(lockdir) - flock $(lockdir)/download -c "$(DOWNLOADER) $@ $(web-survey)/$*" +# The input dataset properties are defined in `$(pconfdir)/INPUTS.mk'. For +# this template pipeline we only have one dataset to enable easy +# processing, so all the extra checks in this rule may seem +# redundant. +# +# However, in a real project, you will need more than one dataset. In that +# case, just add them to the target list and add an `elif' statement to +# define it in the recipe. +# +# Download lock file: Most systems have a single connection to the +# internet, therefore downloading is inherently done in series. As a +# result, when more than one dataset is necessary for download, if they are +# done in parallel, the speed will be slower than downloading them in +# series. 
We thus use the `flock' program to tie/lock the downloading +# process with a file and make sure that only one downloading event is in +# progress at every moment. +$(indir):; mkdir $@ +inputdatasets = $(foreach i, $(WFPC2IMAGE), $(indir)/$(i)) +$(inputdatasets): $(indir)/%: | $(indir) $(lockdir) + + # Set the necessary parameters for this input file. + if [ $* = $(WFPC2IMAGE) ]; then url=$(WFPC2URL); mdf=$(WFPC2MD5); + else + echo; echo; echo "Not recognized input dataset: '$*'." + echo; echo; exit 1 + fi + + # Download (or make the link to) the input dataset. + if [ -f $(INDIR)/$* ]; then + ln -s $(INDIR)/$* $@ + else + flock $(lockdir)/download $(DOWNLOADER) $@ $$url/$* + fi + # Check the md5 sum to see if this is the proper dataset. + sum=$$(md5sum $@ | awk '{print $$1}') + if [ $$sum != $$mdf ]; then + wrongname=$(dir $@)/wrong-$(notdir $@) + mv $@ $$wrongname + echo; echo; echo "Wrong MD5 checksum for '$*' in $$wrongname" + echo; echo; exit 1 + fi @@ -49,5 +80,5 @@ $(all-survey): $(SURVEY)/%: | $(SURVEY) $(lockdir) # # It is very important to mention the address where the data were # downloaded in the final report. -$(mtexdir)/download.tex: $(pconfdir)/web.mk | $(mtexdir) - @echo "\\newcommand{\\websurvey}{$(web-survey)}" > $@ +$(mtexdir)/download.tex: $(pconfdir)/INPUTS.mk | $(mtexdir) + echo "\\newcommand{\\wfpctwourl}{$(WFPC2URL)}" > $@ diff --git a/reproduce/src/make/initialize.mk b/reproduce/src/make/initialize.mk index 694aca0..41a5e05 100644 --- a/reproduce/src/make/initialize.mk +++ b/reproduce/src/make/initialize.mk @@ -34,6 +34,7 @@ # parallel. Also, some programs may not be thread-safe, therefore it will # be necessary to put a lock on them. This pipeline uses the `flock' # program to achieve this. 
+indir = $(BDIR)/inputs texdir = $(BDIR)/tex srcdir = reproduce/src lockdir = $(BDIR)/locks @@ -224,6 +225,14 @@ $(mtexdir)/initialize.tex: | $(mtexdir) fi; \ echo "\newcommand{\\bziptwoversion}{$(bzip2-version)}" >> $@ + # Unfortunately we couldn't find a way to retrieve the version of + # the discoteq `flock' that we are using here. So we'll just report + # the version we downloaded and installed. + echo "\newcommand{\\flockversion}{$(flock-version)}" >> $@ + + + + # Versions of libraries. $(call lvcheck, fitsio.h, $(cfitsio-version), CFITSIO, cfitsioversion) -- cgit v1.2.1