diff options
Diffstat (limited to 'reproduce/analysis/make/download.mk')
-rw-r--r-- | reproduce/analysis/make/download.mk | 86 |
1 files changed, 42 insertions, 44 deletions
diff --git a/reproduce/analysis/make/download.mk b/reproduce/analysis/make/download.mk index ea70fca..7110c8f 100644 --- a/reproduce/analysis/make/download.mk +++ b/reproduce/analysis/make/download.mk @@ -5,7 +5,7 @@ # recipes in this Makefile all use a single file lock to have one download # script running at every instant. # -# Copyright (C) 2018-2021 Mohammad Akhlaghi <mohammad@akhlaghi.org> +# Copyright (C) 2018-2022 Mohammad Akhlaghi <mohammad@akhlaghi.org> # # This Makefile is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -27,70 +27,68 @@ # Download input data # -------------------- # -# The input dataset properties are defined in -# `$(pconfdir)/INPUTS.conf'. For this template we only have one dataset to -# enable easy processing, so all the extra checks in this rule may seem -# redundant. +# 'reproduce/analysis/config/INPUTS.conf' contains the input dataset +# properties. In most cases, you will not need to edit this rule (or +# file!). Simply follow the instructions of 'INPUTS.conf' and set the +# variables names according to the described standards. # -# In a real project, you will need more than one dataset. In that case, -# just add them to the target list and add an `elif' statement to define it -# in the recipe. -# -# Files in a server usually have very long names, which are mainly designed -# for helping in data-base management and being generic. Since Make uses -# file names to identify which rule to execute, and the scope of this -# research project is much less than the generic survey/dataset, it is -# easier to have a simple/short name for the input dataset and work with -# that. In the first condition of the recipe below, we connect the short -# name with the raw database name of the dataset. +# TECHNICAL NOTE on the '$(foreach, n ...)' loop of 'inputdatasets': we are +# using several (relatively complex!) features particular to Make: In GNU +# Make, '.VARIABLES' "... expands to a list of the names of all global +# variables defined so far" (from the "Other Special Variables" section of +# the GNU Make manual). Assuming that the pattern 'INPUT-%-sha256' is only +# used for input files, we find all the variables that contain the input +# file name (the '%' is the filename). Finally, using the +# pattern-substitution function ('patsubst'), we remove the fixed string at +# the start and end of the variable name. # # Download lock file: Most systems have a single connection to the # internet, therefore downloading is inherently done in series. As a # result, when more than one dataset is necessary for download, if they are # done in parallel, the speed will be slower than downloading them in -# series. We thus use the `flock' program to tie/lock the downloading +# series. We thus use the 'flock' program to tie/lock the downloading # process with a file and make sure that only one downloading event is in # progress at every moment. $(indir):; mkdir $@ downloadwrapper = $(bashdir)/download-multi-try -inputdatasets = $(indir)/menke20.xlsx +inputdatasets = $(foreach i, \ + $(patsubst INPUT-%-sha256,%, \ + $(filter INPUT-%-sha256,$(.VARIABLES))), \ + $(indir)/$(i)) $(inputdatasets): $(indir)/%: | $(indir) $(lockdir) - # Set the necessary parameters for this input file. - if [ $* = menke20.xlsx ]; then - localname=$(MK20DATA); url=$(MK20URL); mdf=$(MK20MD5); - else - echo; echo; echo "Not recognized input dataset: '$*'." - echo; echo; exit 1 - fi +# Set the necessary parameters for this input file as shell variables +# (to help in readability). + url=$(INPUT-$*-url) + sha=$(INPUT-$*-sha256) - # Download (or make the link to) the input dataset. If the file - # exists in `INDIR', it may be a symbolic link to some other place - # in the filesystem. To avoid too many links when using these files - # during processing, we'll use `readlink -f' so the link we make - # here points to the final file directly (note that `readlink' is - # part of GNU Coreutils). If its not a link, the `readlink' part - # has no effect. +# Download (or make the link to) the input dataset. If the file +# exists in 'INDIR', it may be a symbolic link to some other place in +# the filesystem. To avoid too many links when using these files +# during processing, we'll use 'readlink -f' so the link we make here +# points to the final file directly (note that 'readlink' is part of +# GNU Coreutils). If its not a link, the 'readlink' part has no +# effect. unchecked=$@.unchecked - if [ -f $(INDIR)/$$localname ]; then - ln -fs $$(readlink -f $(INDIR)/$$localname) $$unchecked + if [ -f $(INDIR)/$* ]; then + ln -fs $$(readlink -f $(INDIR)/$*) $$unchecked else touch $(lockdir)/download $(downloadwrapper) "wget --no-use-server-timestamps -O" \ $(lockdir)/download $$url $$unchecked fi - # Check the md5 sum to see if this is the proper dataset. - sum=$$(md5sum $$unchecked | awk '{print $$1}') - if [ $$sum = $$mdf ]; then +# Check the checksum to see if this is the proper dataset. + sum=$$(sha256sum $$unchecked | awk '{print $$1}') + if [ $$sum = $$sha ]; then mv $$unchecked $@ echo "Integrity confirmed, using $@ in this project." else echo; echo; - echo "Wrong MD5 checksum for input file '$$localname':" + echo "Wrong SHA256 checksum for input file '$*':" echo " File location: $$unchecked"; \ - echo " Expected MD5 checksum: $$mdf"; \ - echo " Calculated MD5 checksum: $$sum"; \ + echo " Expected SHA256 checksum: $$sha"; \ + echo " Calculated SHA256 checksum: $$sum"; \ echo; exit 1 fi @@ -104,7 +102,7 @@ $(inputdatasets): $(indir)/%: | $(indir) $(lockdir) # It is very important to mention the address where the data were # downloaded in the final report. $(mtexdir)/download.tex: $(indir)/menke20.xlsx | $(mtexdir) - echo "\newcommand{\menketwentyxlsxname}{$(MK20DATA)}" > $@ - echo "\newcommand{\menketwentychecksum}{$(MK20MD5)}" >> $@ - echo "\newcommand{\menketwentybytesize}{$(MK20SIZE)}" >> $@ - echo "\newcommand{\menketwentyurl}{$(MK20URL)}" >> $@ + echo "\newcommand{\menketwentyxlsxname}{menke20.xlsx}" > $@ + echo "\newcommand{\menketwentychecksum}{$(INPUT-menke20.xlsx-sha256)}" >> $@ + echo "\newcommand{\menketwentybytesize}{$(INPUT-menke20.xlsx-size)}" >> $@ + echo "\newcommand{\menketwentyurl}{$(INPUT-menke20.xlsx-url)}" >> $@ |